From 4bfaea7d83ad5b30b4d856a16a3f128f4dfd903d Mon Sep 17 00:00:00 2001
From: dim <dim>
Date: Thu, 14 Feb 2008 12:08:29 +0000
Subject: [PATCH] FIX reader object init in the presence of templates

---
 pgloader/csvreader.py  | 34 +++++++++--------
 pgloader/pgloader.py   | 83 +++++++++++++++++++++++-------------------
 pgloader/reader.py     | 54 +++++++++++++++------------
 pgloader/textreader.py | 21 +++--------
 4 files changed, 100 insertions(+), 92 deletions(-)
diff --git a/pgloader/csvreader.py b/pgloader/csvreader.py
index 4366bf1..f617270 100644
--- a/pgloader/csvreader.py
+++ b/pgloader/csvreader.py
@@ -25,27 +25,29 @@ class CSVReader(DataReader):
     Read some CSV formatted data
     """
 
-    def readconfig(self, name, config):
+    def readconfig(self, config, name, template):
         """ get this reader module configuration from config file """
-        DataReader.readconfig(self, name, config)
+        DataReader.readconfig(self, config, name, template)
+
+        self._getopt('doublequote', config, name, template, False)
+        if self.doublequote is not False:
+            self.doublequote = self.doublequote == 'True'
         
-        # optionnal doublequote: defaults to escaping, not doubling
-        self.doublequote = False
-        if config.has_option(name, 'doublequote'):
-            self.trailing_sep = config.get(name, 'doublequote') == 'True'
+        self._getopt('escapechar', config, name, template, None)
+        if self.escapechar is not None:
+            self.escapechar = self.escapechar[0]
 
-        self.escapechar = None
-        if config.has_option(name, 'escapechar'):
-            self.escapechar = config.get(name, 'escapechar')[0]
+        self._getopt('quotechar', config, name, template, '"')
+        self.quotechar = self.quotechar[0]
 
-        self.quotechar = '"'
-        if config.has_option(name, 'quotechar'):
-            self.quotechar = config.get(name, 'quotechar')[0]
-
-        self.skipinitialspace = False
-        if config.has_option(name, 'skipinitialspace'):
-            self.skipinitialspace = config.get(name, 'skipinitialspace') == 'True'
+        self._getopt('skipinitialspace', config, name, template, False)
+        if self.skipinitialspace is not False:
+            self.skipinitialspace = self.skipinitialspace == 'True'
 
+        for opt in ['doublequote', 'escapechar',
+                    'quotechar', 'skipinitialspace']:
+            
+            self.log.debug("reader.readconfig %s: '%s'" % (opt, self.__dict__[opt]))
 
     def readlines(self):
         """ read data from configured file, and generate (yields) for
diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py
index 67553f4..12f8c43 100644
--- a/pgloader/pgloader.py
+++ b/pgloader/pgloader.py
@@ -64,6 +64,7 @@ class PGLoader(threading.Thread):
 
         self.template     = None
         self.use_template = None
+        self.tsection     = None
 
         self.index     = None
         self.columns   = None
@@ -88,6 +89,7 @@ class PGLoader(threading.Thread):
         if not self.template:
             # check if the section wants to use a template
             if config.has_option(name, 'use_template'):
+                self.tsection = config.get(name, 'use_template')
                 self.template = config.get(name, 'use_template')
 
                 if not config.has_section(self.template):
@@ -121,13 +123,6 @@ class PGLoader(threading.Thread):
             
             self._read_conf(name, config, db)
 
-        # force reinit of self.reader, which depends on template and
-        # specific options
-        if 'reader' in self.__dict__:
-            self.reader.__init__(self.log, self.db, self.reject,
-                                 self.filename, self.input_encoding,
-                                 self.table, self.columns)
-
         # Now reset database connection
         if not DRY_RUN:
             self.db.log = self.log
@@ -141,6 +136,7 @@ class PGLoader(threading.Thread):
     
         if DRY_RUN:
             log.info("dry run mode, not connecting to database")
+            self.db = None
             return
 
         try:
@@ -214,7 +210,7 @@ class PGLoader(threading.Thread):
                 self.log.info('rejected data in %s', self.reject.reject_data)
 
             else:
-                # needed to instanciate self.reader while in template section
+                # needed to instantiate self.reject while in template section
                 self.reject = None
 
         # optionnal local option client_encoding
@@ -487,7 +483,11 @@ class PGLoader(threading.Thread):
             self.rrqueue_size = config.getint(name, 'rrqueue_size')
 
         if self.rrqueue_size is None or self.rrqueue_size < 1:
-            self.rrqueue_size = self.db.copy_every
+            if DRY_RUN:
+                # won't be used
+                self.rrqueue_size = 1
+            else:
+                self.rrqueue_size = self.db.copy_every
             
         if not self.template:
             for opt in ('section_threads', 'split_file_reading'):
@@ -507,48 +507,50 @@ class PGLoader(threading.Thread):
         # Reader's init
         if config.has_option(name, 'format'):
             self.format = config.get(name, 'format')
+        
+        if not self.template:
+            # Only init self.reader in a real section, not in a
+            # template.  self.reader.readconfig() will take care of
+            # reading its configuration from both the template and
+            # the current section.
+
+            if 'format' not in self.__dict__:
+                raise PGLoader_Error, "Please configure %s.format" % name
+                
+            self.log.info("File '%s' will be read in %s format" \
+                          % (self.filename, self.format))
 
             if self.format.lower() == 'csv':
                 from csvreader import CSVReader 
                 self.reader = CSVReader(self.log, self.db, self.reject,
-                                        self.filename, self.input_encoding,
+                                        self.filename,
+                                        self.input_encoding,
                                         self.table, self.columns)
 
             elif self.format.lower() == 'text':
                 from textreader import TextReader
                 self.reader = TextReader(self.log, self.db, self.reject,
-                                         self.filename, self.input_encoding,
+                                         self.filename,
+                                         self.input_encoding,
                                          self.table, self.columns,
                                          self.newline_escapes)
 
-        if not self.template \
-               and self.format.lower() == 'text' \
-               and ('field_count' in self.reader.__dict__ \
-                    and self.reader.field_count) \
-               and ('trailing_sep' in self.reader.__dict__ \
-                    and self.reader.trailing_sep):
-            
-            # this option is not compatible with text mode when
-            # field_count is used (meaning end of line could be found
-            # in the data)
-            
-            raise PGLoader_Error, \
-                  "Can't use split_file_reading with text " +\
-                  "format when 'field_count' is used"
-
-        if not self.template:
-            self.log.info("File '%s' will be read in %s format" \
-                          % (self.filename, self.format))
-
-        if 'reader' in self.__dict__:
             self.log.debug('reader.readconfig()')
-            self.reader.readconfig(name, config)
+            self.reader.readconfig(config, name, self.tsection)
 
-        if not self.template and \
-           ('format' not in self.__dict__ or self.format is None):
-            # error only when not loading the Template part
-            self.log.Error('%s: format parameter needed', name)
-            raise PGLoader_Error
+
+            if self.split_file_reading:
+                if self.format.lower() == 'text' \
+                   and (self.reader.field_count is not None \
+                        or self.reader.trailing_sep):
+
+                    # split_file_reading is not compatible with text
+                    # mode when field_count or trailing_sep is used
+                    # (meaning end of line could be found in the data)
+
+                    raise PGLoader_Error, \
+                          "Can't use split_file_reading with text " +\
+                          "format when 'field_count' or 'trailing_sep' is used"
 
         ##
         # Some column might need reformating
@@ -1045,9 +1047,14 @@ class PGLoader(threading.Thread):
         
         if self.reject is not None:
             self.errors = self.reject.errors
+
+        if DRY_RUN:
+            self.commited_rows = 0
+        else:
+            self.commited_rows = self.db.commited_rows
             
         for x in [self.table, self.duration,
-                  self.db.commited_rows, self.errors]:
+                  self.commited_rows, self.errors]:
             self.stats.append(x)
 
         # then show up some stats
diff --git a/pgloader/reader.py b/pgloader/reader.py
index b198b53..0757ef7 100644
--- a/pgloader/reader.py
+++ b/pgloader/reader.py
@@ -42,11 +42,14 @@ class DataReader:
         self.start = None
         self.end   = None
 
-    def readconfig(self, name, config):
+    def readconfig(self, config, name, template):
         """ read configuration section for common options
 
         name is configuration section name, conf the ConfigParser object
 
+        template is the template section name declared in the
+        use_template configuration option, or None if there is none.
+
         specific option reading code is to be found on subclasses
         which implements read data parsing code.
 
@@ -55,35 +58,40 @@ class DataReader:
 
         if not DRY_RUN:
             # optionnal null and empty_string per table parameters
-            if config.has_option(name, 'null'):
-                self.db.null = parse_config_string(config.get(name, 'null'))
-            else:
-                if 'null' not in self.__dict__:
-                    self.db.null = NULL
+            self._getopt('null', config, name, template, NULL)
+            self.db.null = parse_config_string(self.null)
 
-            if config.has_option(name, 'empty_string'):
-                self.db.empty_string = parse_config_string(
-                    config.get(name, 'empty_string'))
-            else:
-                if 'empty_string' not in self.__dict__:
-                    self.db.empty_string = EMPTY_STRING
+            self._getopt('empty_string', config, name, template, EMPTY_STRING)
+            self.db.empty_string = parse_config_string(self.empty_string)
 
-        # optionnal field separator, could be defined from template
-        if 'field_sep' not in self.__dict__:
-            self.field_sep = FIELD_SEP
-        
-        if config.has_option(name, 'field_sep'):
-            self.field_sep = config.get(name, 'field_sep')
-
-            if not DRY_RUN:
-                if self.db.copy_sep is None:
-                    self.db.copy_sep = self.field_sep
+        self._getopt('field_sep', config, name, template, FIELD_SEP)
+        if not DRY_RUN:
+            if self.db.copy_sep is None:
+                self.db.copy_sep = self.field_sep
 
         if not DRY_RUN:
             self.log.debug("reader.readconfig null: '%s'" % self.db.null)
             self.log.debug("reader.readconfig empty_string: '%s'",
                            self.db.empty_string)
-            self.log.debug("reader.readconfig field_sep: '%s'", self.field_sep)
+            
+        self.log.debug("reader.readconfig field_sep: '%s'", self.field_sep)
+
+    def _getopt(self, option, config, section, template, default = None):
+        """ Set self.<option> from section, else template, else default if unset; return it """
+
+        if config.has_option(section, option):
+            self.__dict__[option] = config.get(section, option)
+            self.log.debug("reader._getopt %s from %s is '%s'" % (option, section, self.__dict__[option]))
+
+        elif template and config.has_option(template, option):
+            self.__dict__[option] = config.get(template, option)
+            self.log.debug("reader._getopt %s from %s is '%s'" % (option, template, self.__dict__[option]))
+
+        elif option not in self.__dict__:
+            self.log.debug("reader._getopt %s defaults to '%s'" % (option, default))
+            self.__dict__[option] = default
+
+        return self.__dict__[option]
 
     def readlines(self):
         """ read data from configured file, and generate (yields) for
diff --git a/pgloader/textreader.py b/pgloader/textreader.py
index f7a70df..882fc4b 100644
--- a/pgloader/textreader.py
+++ b/pgloader/textreader.py
@@ -39,27 +39,18 @@ class TextReader(DataReader):
         if 'newline_escapes' not in self.__dict__:
             self.newline_escapes = newline_escapes
 
-    def readconfig(self, name, config):
+    def readconfig(self, config, name, template):
         """ get this reader module configuration from config file """
-        DataReader.readconfig(self, name, config)
+        DataReader.readconfig(self, config, name, template)
 
         # this will be called twice if templates are in used, so we
         # have to protect ourselves against removing already read
         # configurations while in second run.
 
-        # optionnal number of columns per line
-        if 'field_count' not in self.__dict__:
-            self.field_count = None
-            
-        if config.has_option(name, 'field_count'):
-            self.field_count = config.getint(name, 'field_count')
-
-        # optionnal trailing separator option
-        if 'trailing_sep' not in self.__dict__:
-            self.trailing_sep = False
-            
-        if config.has_option(name, 'trailing_sep'):
-            self.trailing_sep = config.get(name, 'trailing_sep') == 'True'
+        self._getopt('field_count', config, name, template, None)  # NOTE(review): now a str (was getint) -- confirm
+        self._getopt('trailing_sep', config, name, template, False)
+        if self.trailing_sep is not False:
+            self.trailing_sep = self.trailing_sep == 'True'
 
         self.log.debug('reader.readconfig: field_count %s', self.field_count)
         self.log.debug('reader.readconfig: trailing_sep %s', self.trailing_sep)