Configuration now knows about templates, and the command line about a reformat path option, --reformat_path= or -R.
2026-05-05 10:56:10 +02:00 · 2007-11-26 21:33:24 +00:00 · 2007-11-26 21:33:24 +00:00 · 1e7a16b869
commit 1e7a16b869
parent 3a8ac261c8
9 changed files with 144 additions and 72 deletions
--- a/debian/changelog
+++ b/debian/changelog
@ -1,3 +1,11 @@
+pgloader (2.2.5-dev) unstable; urgency=low
+
+  * Configuration now supports templates
+  * Command line option for setting --reformat_path, -R
+
+ -- Dimitri Fontaine <dim@tapoueh.org>  Mon, 26 Nov 2007 21:53:11 +0100
+
 pgloader (2.2.4) unstable; urgency=low

  * Reformat modules to change input on-the-fly
--- a/pgloader.py
+++ b/pgloader.py
@ -101,6 +101,10 @@ def parse_options():
                      default = None,
                      help    = "input files encoding")

+    parser.add_option("-R", "--reformat_path", dest = "reformat_path",
+                      default = None,
+                      help    = "PATH where to find reformat python modules")
+
    (opts, args) = parser.parse_args()

    if opts.version:
@ -129,10 +133,13 @@ def parse_options():
        print "Error: Can't be verbose and quiet at the same time!"
        sys.exit(1)

+    # if debug, then verbose
+    if opts.debug:
+        opts.verbose = True
+
    pgloader.options.DRY_RUN    = opts.dryrun
    pgloader.options.DEBUG      = opts.debug
-    # if debug, then verbose
-    pgloader.options.VERBOSE    = opts.verbose or opts.debug
+    pgloader.options.VERBOSE    = opts.verbose
    pgloader.options.QUIET      = opts.quiet
    pgloader.options.SUMMARY    = opts.summary    
    pgloader.options.PEDANTIC   = opts.pedantic
@ -146,6 +153,9 @@ def parse_options():

    pgloader.options.INPUT_ENCODING = opts.encoding

+    if opts.reformat_path:
+        pgloader.options.REFORMAT_PATH = opts.reformat_path
+
    return opts.config, args

 def parse_config(conffile):
@ -227,19 +237,10 @@ def parse_config(conffile):
                config.get(section, 'empty_string'))

        if config.has_option(section, 'reformat_path'):
-            import os.path
-            reformat_path = []
-            tmp_rpath = config.get(section, 'reformat_path')
-
-            for p in tmp_rpath.split(':'):
-                if os.path.exists(p):
-                    reformat_path.append(p)
-                else:
-                    print 'Error: reformat_path %s does not exists, ignored'%p
-
-            pgloader.options.REFORMAT_PATH = reformat_path
-        else:
-            pgloader.reformat_path = None
+            # the command line value is preferred to the config file one
+            if not pgloader.options.REFORMAT_PATH:
+                rpath = config.get(section, 'reformat_path')
+                pgloader.options.REFORMAT_PATH = rpath

    except Exception, error:
        print "Error: Could not initialize PostgreSQL connection:"
@ -361,6 +362,25 @@ def load_data():
    # now init db connection
    config, dbconn = parse_config(conffile)

+    from pgloader.tools   import read_path, check_path
+    from pgloader.options import VERBOSE
+    import pgloader.options
+    rpath  = read_path(pgloader.options.REFORMAT_PATH, check = False)
+    crpath = check_path(rpath, VERBOSE)
+
+    if not crpath:
+        # don't check same path entries twice
+        default_rpath = set(crpath) \
+                        - set(pgloader.options.DEFAULT_REFORMAT_PATH)
+        
+        pgloader.options.REFORMAT_PATH = check_path(default_rpath, VERBOSE)
+    else:
+        pgloader.options.REFORMAT_PATH = rpath
+
+    if VERBOSE:
+        print 'Notice: Reformat path is', pgloader.options.REFORMAT_PATH
+        print
+
    # load some pgloader package modules
    from pgloader.options  import VERBOSE, DEBUG, QUIET, SUMMARY
    from pgloader.options  import DRY_RUN, PEDANTIC, VACUUM
@ -392,11 +412,17 @@ def load_data():
    sections.sort()
    for s in sections:
        try:
+            if VERBOSE:
+                print
+                
            pgloader = PGLoader(s, config, dbconn)
            
            if not pgloader.template:
                pgloader.run()            
                summary[s] = (pgloader.table,) + pgloader.summary()
+            else:
+                if VERBOSE:
+                    print "Skipping section %s, which is a template" % s
                
        except PGLoader_Error, e:
            if e == '':
--- a/pgloader/init.py
+++ b/pgloader/init.py
@ -0,0 +1,3 @@
+"""
+pgloader package, offering modules to implement pgloader.
+"""
--- a/pgloader/options.py
+++ b/pgloader/options.py
@ -2,7 +2,7 @@
 #
 # Some common options, for each module to get them

-PGLOADER_VERSION = '2.2.4'
+PGLOADER_VERSION = '2.2.5-devel'

 INPUT_ENCODING     = None
 PG_CLIENT_ENCODING = 'latin9'
@ -32,4 +32,5 @@ FROM_ID    = None

 UDC_PREFIX = 'udc_'

-REFORMAT_PATH = ['/usr/share/pgloader/reformat']
+REFORMAT_PATH = None
+DEFAULT_REFORMAT_PATH = ['/usr/share/pgloader/reformat']
--- a/pgloader/pgloader.py
+++ b/pgloader/pgloader.py
@ -54,11 +54,9 @@ class PGLoader:
            
            # just skip it here
            if VERBOSE:
-                print
-                print "[%s] skip template configuration" % self.name
+                print "[%s] is a template" % self.name

        if not self.template and VERBOSE:
-            print
            print "[%s] parse configuration" % self.name

        if not self.template:
@ -83,11 +81,17 @@ class PGLoader:

            # now load specific configuration
            if VERBOSE:
-                print
                print "Reading configuration from section [%s]" % name
            
            self.__read_conf__(name, config, db)

+        # force reinit of self.reader, which depends on template and
+        # specific options
+        if 'reader' in self.__dict__:
+            self.reader.__init__(self.db, self.reject,
+                                 self.filename, self.input_encoding,
+                                 self.table, self.columns)
+
        if DEBUG:
            print '%s init done' % name
            print
@ -181,11 +185,15 @@ class PGLoader:
            print 'columns', self.columns
            print 'blob_columns', self.blob_cols

-        if self.name == name and not self.columns:
-            print 'Error: %s has no columns defined' % name
-            self.config_errors += 1
+        if self.columns is None:
+            if not self.template:
+                print 'Error: %s has no columns defined' % name
+                self.config_errors += 1

-        self.columns = []
+            else:
+                # non critical error, and code thereafter wants to use
+                # self.columns as a list
+                self.columns = []

        ##
        # The config section can also provide user-defined colums
@ -331,9 +339,7 @@ class PGLoader:
                self.columnlist = [n for (n, pos) in self.columns]

        if DEBUG:
-            #print "columns", self.columns
            print "only_cols", self.only_cols
-            #print "udcs", self.udcs
            print "columnlist", self.columnlist

        ##
@ -364,47 +370,28 @@ class PGLoader:
        if config.has_option(name, 'format'):
            self.format = config.get(name, 'format')

-            if 'reader' not in self.__dict__:
-                if DEBUG:
-                    print 'READER INIT'
-                
-                if self.format.lower() == 'csv':
-                    from csvreader import CSVReader 
-                    self.reader = CSVReader(self.db, self.reject,
-                                            self.filename,
-                                            self.input_encoding,
-                                            self.table, self.columns)
+            if self.format.lower() == 'csv':
+                from csvreader import CSVReader 
+                self.reader = CSVReader(self.db, self.reject,
+                                        self.filename, self.input_encoding,
+                                        self.table, self.columns)

-                elif self.format.lower() == 'text':
-                    from textreader import TextReader
-                    self.reader = TextReader(self.db, self.reject,
-                                             self.filename,
-                                             self.input_encoding,
-                                             self.table, self.columns,
-                                             self.newline_escapes)
+            elif self.format.lower() == 'text':
+                from textreader import TextReader
+                self.reader = TextReader(self.db, self.reject,
+                                         self.filename, self.input_encoding,
+                                         self.table, self.columns,
+                                         self.newline_escapes)

-                self.reader.readconfig(name, config)
+        if 'reader' in self.__dict__:
+            if DEBUG:
+                print 'reader.readconfig()'
+            self.reader.readconfig(name, config)

        if not self.template and self.format is None:
            # error only when not loading the Template part
            print 'Error: %s: format parameter needed' % name
            raise PGLoader_Error
-        else:
-            if DEBUG:
-                print 'MANUAL REINIT OF READER'
-            self.reader.reject          = self.reject
-            self.reader.filename        = self.filename
-            self.reader.input_encoding  = self.input_encoding
-            self.reader.newline_escapes = self.newline_escapes
-            self.reader.readconfig(name, config)
-
-            print 'BLURPS', self.reader.trailing_sep
-
-##         ##
-##         # parse the reader specific section options
-##         if not self.template:
-##             self.reader.readconfig(name, config)
-##             print 'BLURPS', self.reader.trailing_sep

        ##
        # Some column might need reformating
@ -444,8 +431,10 @@ class PGLoader:
                    print 'Error: %s failed to import reformat module "%s"' \
                          % (name, r_module)
                    print '       from %s' % str(REFORMAT_PATH)
+                    print '       %s' % e
                    self.config_errors += 1

+
                if module:
                    if r_function in module.__dict__:
                        self.reformat.append((r_colname,
--- a/pgloader/reader.py
+++ b/pgloader/reader.py
@ -22,6 +22,9 @@ class DataReader:

    def __init__(self, db, reject, filename, input_encoding, table, columns):
        """ init internal variables """
+        if DEBUG:
+            print 'reader __init__', filename, table, columns
+        
        self.db        = db
        self.filename  = filename
        self.input_encoding = input_encoding
@ -32,7 +35,7 @@ class DataReader:
        if self.input_encoding is None:
            if INPUT_ENCODING is not None:
                self.input_encoding = INPUT_ENCODING
-                                
+
    def readconfig(self, name, config):
        """ read configuration section for common options

@ -67,8 +70,9 @@ class DataReader:
                    self.db.copy_sep = self.field_sep

        if DEBUG and not DRY_RUN:
-            print "null: '%s'" % self.db.null
-            print "empty_string: '%s'" %  self.db.empty_string
+            print "reader.readconfig null: '%s'" % self.db.null
+            print "reader.readconfig empty_string: '%s'" \
+                  %  self.db.empty_string

    def readlines(self):
        """ read data from configured file, and generate (yields) for
--- a/pgloader/textreader.py
+++ b/pgloader/textreader.py
@ -31,28 +31,39 @@ class TextReader(DataReader):
    """

    def __init__(self, db, reject, filename, input_encoding,
-                 table, columns, newline_escapes):
+                 table, columns, newline_escapes = None):
        """ init textreader with a newline_escapes parameter """
        DataReader.__init__(self, db, reject,
                            filename, input_encoding, table, columns)

-        self.newline_escapes = newline_escapes
-
+        if 'newline_escapes' not in self.__dict__:
+            self.newline_escapes = newline_escapes

    def readconfig(self, name, config):
        """ get this reader module configuration from config file """
        DataReader.readconfig(self, name, config)

+        # this will be called twice when templates are in use, so we
+        # have to protect ourselves against discarding configuration
+        # that has already been read when the second call happens.
+
        # optionnal number of columns per line
-        self.field_count = None
+        if 'field_count' not in self.__dict__:
+            self.field_count = None
+            
        if config.has_option(name, 'field_count'):
            self.field_count = config.getint(name, 'field_count')

        # optionnal trailing separator option
-        self.trailing_sep = False
+        if 'trailing_sep' not in self.__dict__:
+            self.trailing_sep = False
+            
        if config.has_option(name, 'trailing_sep'):
            self.trailing_sep = config.get(name, 'trailing_sep') == 'True'

+        if DEBUG:
+            print 'reader.readconfig: field_count', self.field_count
+            print 'reader.readconfig: trailing_sep', self.trailing_sep

    def readlines(self):
        """ read data from configured file, and generate (yields) for
--- a/pgloader/tools.py
+++ b/pgloader/tools.py
@ -116,3 +116,34 @@ def parse_config_string(str):

            
    
+def read_path(strpath, verbose = False, path = None, check = True):
+    """ read a ':'-separated path configuration element into a list
+
+    strpath -- path entries separated by ':'; None or an empty string
+               adds no entry
+    verbose -- passed on to check_path() when check is true
+    path    -- list the entries are appended to; a new list is created
+               for each call when not given
+    check   -- when true, return check_path(path, verbose) rather than
+               the unfiltered list
+    """
+    # a [] default would be created once and shared by every call, so
+    # entries from earlier calls would pile up in later results
+    if path is None:
+        path = []
+
+    # strpath may be None when no path has been configured
+    if strpath:
+        path.extend(strpath.split(':'))
+
+    if check:
+        return check_path(path, verbose)
+    else:
+        return path
+
+def check_path(path, verbose = False):
+    """ return the entries of path which are directories
+
+    Entries which are symlinks to a directory are kept too. Any other
+    entry is left out of the returned list, with a Warning printed when
+    verbose is true. The path argument itself is not modified.
+    """
+    # imported locally: no import of os is visible in this function's scope
+    import os.path
+
+    path_ok = []
+
+    for p in path:
+        # isdir() follows symlinks, so links to directories are accepted too
+        if os.path.isdir(p):
+            path_ok.append(p)
+        elif os.path.exists(p):
+            if verbose:
+                print "Warning: path entry '%s' " % p + \
+                      "is not a directory or does not link to a directory"
+        else:
+            if verbose:
+                print "Warning: path entry '%s' does not exists, ignored" % p
+
+    return path_ok
--- a/reformat/mysql.py
+++ b/reformat/mysql.py
@ -2,7 +2,6 @@
 #
 # pgloader mysql reformating module
 #
-from pgloader.tools import PGLoader_Error

 def timestamp(reject, input):
    """ Reformat str as a PostgreSQL timestamp
@ -12,7 +11,7 @@ def timestamp(reject, input):
    """
    if len(input) != 14:
        e = "MySQL timestamp reformat input too short: %s" % input
-        raise PGLoader_Error, e
+        reject.log(e, input)
    
    year    = input[0:4]
    month   = input[4:6]
@ -21,4 +20,4 @@ def timestamp(reject, input):
    minute  = input[10:12]
    seconds = input[12:14]
    
-    return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, month, seconds)
+    return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, minute, seconds)