diff --git a/debian/changelog b/debian/changelog index 35e1ec1..64b04ce 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,11 @@ +pgloader (2.2.5-dev) unstable; urgency=low + + * Configuration now supports templates + * Command line option for setting --reformat_path, -R + * + + -- Dimitri Fontaine Mon, 26 Nov 2007 21:53:11 +0100 + pgloader (2.2.4) unstable; urgency=low * Reformat modules to change input on-the-fly diff --git a/pgloader.py b/pgloader.py index 6c557e5..c662be3 100644 --- a/pgloader.py +++ b/pgloader.py @@ -101,6 +101,10 @@ def parse_options(): default = None, help = "input files encoding") + parser.add_option("-R", "--reformat_path", dest = "reformat_path", + default = None, + help = "PATH where to find reformat python modules") + (opts, args) = parser.parse_args() if opts.version: @@ -129,10 +133,13 @@ def parse_options(): print "Error: Can't be verbose and quiet at the same time!" sys.exit(1) + # if debug, then verbose + if opts.debug: + opts.verbose = True + pgloader.options.DRY_RUN = opts.dryrun pgloader.options.DEBUG = opts.debug - # if debug, then verbose - pgloader.options.VERBOSE = opts.verbose or opts.debug + pgloader.options.VERBOSE = opts.verbose pgloader.options.QUIET = opts.quiet pgloader.options.SUMMARY = opts.summary pgloader.options.PEDANTIC = opts.pedantic @@ -146,6 +153,9 @@ def parse_options(): pgloader.options.INPUT_ENCODING = opts.encoding + if opts.reformat_path: + pgloader.options.REFORMAT_PATH = opts.reformat_path + return opts.config, args def parse_config(conffile): @@ -227,19 +237,10 @@ def parse_config(conffile): config.get(section, 'empty_string')) if config.has_option(section, 'reformat_path'): - import os.path - reformat_path = [] - tmp_rpath = config.get(section, 'reformat_path') - - for p in tmp_rpath.split(':'): - if os.path.exists(p): - reformat_path.append(p) - else: - print 'Error: reformat_path %s does not exists, ignored'%p - - pgloader.options.REFORMAT_PATH = reformat_path - else: - 
pgloader.reformat_path = None + # command line value is preferred to config format one + if not pgloader.options.REFORMAT_PATH: + rpath = config.get(section, 'reformat_path') + pgloader.options.REFORMAT_PATH = rpath except Exception, error: print "Error: Could not initialize PostgreSQL connection:" @@ -361,6 +362,25 @@ def load_data(): # now init db connection config, dbconn = parse_config(conffile) + from pgloader.tools import read_path, check_path + from pgloader.options import VERBOSE + import pgloader.options + rpath = [] + if pgloader.options.REFORMAT_PATH: + rpath = read_path(pgloader.options.REFORMAT_PATH, check = False) + crpath = check_path(rpath, VERBOSE) + if not crpath: + # no valid user entry: use defaults, don't check same entries twice + default_rpath = set(pgloader.options.DEFAULT_REFORMAT_PATH) \ + - set(rpath) + pgloader.options.REFORMAT_PATH = check_path(default_rpath, VERBOSE) + else: + pgloader.options.REFORMAT_PATH = crpath + + if VERBOSE: + print 'Notice: Reformat path is', pgloader.options.REFORMAT_PATH + print + # load some pgloader package modules from pgloader.options import VERBOSE, DEBUG, QUIET, SUMMARY from pgloader.options import DRY_RUN, PEDANTIC, VACUUM @@ -392,11 +412,17 @@ def load_data(): sections.sort() for s in sections: try: + if VERBOSE: + print + pgloader = PGLoader(s, config, dbconn) if not pgloader.template: pgloader.run() summary[s] = (pgloader.table,) + pgloader.summary() + else: + if VERBOSE: + print "Skipping section %s, which is a template" % s except PGLoader_Error, e: if e == '': diff --git a/pgloader/__init__.py b/pgloader/__init__.py index e69de29..269872b 100644 --- a/pgloader/__init__.py +++ b/pgloader/__init__.py @@ -0,0 +1,3 @@ +""" +pgloader package, offering modules to implement pgloader. 
+""" diff --git a/pgloader/options.py b/pgloader/options.py index 1ba180f..8ef7310 100644 --- a/pgloader/options.py +++ b/pgloader/options.py @@ -2,7 +2,7 @@ # # Some common options, for each module to get them -PGLOADER_VERSION = '2.2.4' +PGLOADER_VERSION = '2.2.5-devel' INPUT_ENCODING = None PG_CLIENT_ENCODING = 'latin9' @@ -32,4 +32,5 @@ FROM_ID = None UDC_PREFIX = 'udc_' -REFORMAT_PATH = ['/usr/share/pgloader/reformat'] +REFORMAT_PATH = None +DEFAULT_REFORMAT_PATH = ['/usr/share/pgloader/reformat'] diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py index 7a86ff0..2320649 100644 --- a/pgloader/pgloader.py +++ b/pgloader/pgloader.py @@ -54,11 +54,9 @@ class PGLoader: # just skip it here if VERBOSE: - print - print "[%s] skip template configuration" % self.name + print "[%s] is a template" % self.name if not self.template and VERBOSE: - print print "[%s] parse configuration" % self.name if not self.template: @@ -83,11 +81,17 @@ class PGLoader: # now load specific configuration if VERBOSE: - print print "Reading configuration from section [%s]" % name self.__read_conf__(name, config, db) + # force reinit of self.reader, which depends on template and + # specific options + if 'reader' in self.__dict__: + self.reader.__init__(self.db, self.reject, + self.filename, self.input_encoding, + self.table, self.columns) + if DEBUG: print '%s init done' % name print @@ -181,11 +185,15 @@ class PGLoader: print 'columns', self.columns print 'blob_columns', self.blob_cols - if self.name == name and not self.columns: - print 'Error: %s has no columns defined' % name - self.config_errors += 1 + if self.columns is None: + if not self.template: + print 'Error: %s has no columns defined' % name + self.config_errors += 1 - self.columns = [] + else: + # non critical error, and code thereafter wants to use + # self.columns as a list + self.columns = [] ## # The config section can also provide user-defined colums @@ -331,9 +339,7 @@ class PGLoader: self.columnlist = [n for (n, pos) 
in self.columns] if DEBUG: - #print "columns", self.columns print "only_cols", self.only_cols - #print "udcs", self.udcs print "columnlist", self.columnlist ## @@ -364,47 +370,28 @@ class PGLoader: if config.has_option(name, 'format'): self.format = config.get(name, 'format') - if 'reader' not in self.__dict__: - if DEBUG: - print 'READER INIT' - - if self.format.lower() == 'csv': - from csvreader import CSVReader - self.reader = CSVReader(self.db, self.reject, - self.filename, - self.input_encoding, - self.table, self.columns) + if self.format.lower() == 'csv': + from csvreader import CSVReader + self.reader = CSVReader(self.db, self.reject, + self.filename, self.input_encoding, + self.table, self.columns) - elif self.format.lower() == 'text': - from textreader import TextReader - self.reader = TextReader(self.db, self.reject, - self.filename, - self.input_encoding, - self.table, self.columns, - self.newline_escapes) + elif self.format.lower() == 'text': + from textreader import TextReader + self.reader = TextReader(self.db, self.reject, + self.filename, self.input_encoding, + self.table, self.columns, + self.newline_escapes) - self.reader.readconfig(name, config) + if 'reader' in self.__dict__: + if DEBUG: + print 'reader.readconfig()' + self.reader.readconfig(name, config) if not self.template and self.format is None: # error only when not loading the Template part print 'Error: %s: format parameter needed' % name raise PGLoader_Error - else: - if DEBUG: - print 'MANUAL REINIT OF READER' - self.reader.reject = self.reject - self.reader.filename = self.filename - self.reader.input_encoding = self.input_encoding - self.reader.newline_escapes = self.newline_escapes - self.reader.readconfig(name, config) - - print 'BLURPS', self.reader.trailing_sep - -## ## -## # parse the reader specific section options -## if not self.template: -## self.reader.readconfig(name, config) -## print 'BLURPS', self.reader.trailing_sep ## # Some column might need reformating @@ -444,8 
+431,10 @@ class PGLoader: print 'Error: %s failed to import reformat module "%s"' \ % (name, r_module) print ' from %s' % str(REFORMAT_PATH) + print ' %s' % e self.config_errors += 1 + if module: if r_function in module.__dict__: self.reformat.append((r_colname, diff --git a/pgloader/reader.py b/pgloader/reader.py index 2c31d06..e9a4d23 100644 --- a/pgloader/reader.py +++ b/pgloader/reader.py @@ -22,6 +22,9 @@ class DataReader: def __init__(self, db, reject, filename, input_encoding, table, columns): """ init internal variables """ + if DEBUG: + print 'reader __init__', filename, table, columns + self.db = db self.filename = filename self.input_encoding = input_encoding @@ -32,7 +35,7 @@ class DataReader: if self.input_encoding is None: if INPUT_ENCODING is not None: self.input_encoding = INPUT_ENCODING - + def readconfig(self, name, config): """ read configuration section for common options @@ -67,8 +70,9 @@ class DataReader: self.db.copy_sep = self.field_sep if DEBUG and not DRY_RUN: - print "null: '%s'" % self.db.null - print "empty_string: '%s'" % self.db.empty_string + print "reader.readconfig null: '%s'" % self.db.null + print "reader.readconfig empty_string: '%s'" \ + % self.db.empty_string def readlines(self): """ read data from configured file, and generate (yields) for diff --git a/pgloader/textreader.py b/pgloader/textreader.py index 2c24d49..151ed30 100644 --- a/pgloader/textreader.py +++ b/pgloader/textreader.py @@ -31,28 +31,39 @@ class TextReader(DataReader): """ def __init__(self, db, reject, filename, input_encoding, - table, columns, newline_escapes): + table, columns, newline_escapes = None): """ init textreader with a newline_escapes parameter """ DataReader.__init__(self, db, reject, filename, input_encoding, table, columns) - self.newline_escapes = newline_escapes - + if 'newline_escapes' not in self.__dict__: + self.newline_escapes = newline_escapes def readconfig(self, name, config): """ get this reader module configuration from config file 
""" DataReader.readconfig(self, name, config) + # this will be called twice if templates are in use, so we + # have to protect ourselves against removing already read + # configurations while in second run. + # optionnal number of columns per line - self.field_count = None + if 'field_count' not in self.__dict__: + self.field_count = None + if config.has_option(name, 'field_count'): self.field_count = config.getint(name, 'field_count') # optionnal trailing separator option - self.trailing_sep = False + if 'trailing_sep' not in self.__dict__: + self.trailing_sep = False + if config.has_option(name, 'trailing_sep'): self.trailing_sep = config.get(name, 'trailing_sep') == 'True' + if DEBUG: + print 'reader.readconfig: field_count', self.field_count + print 'reader.readconfig: trailing_sep', self.trailing_sep def readlines(self): """ read data from configured file, and generate (yields) for diff --git a/pgloader/tools.py b/pgloader/tools.py index c1c2457..3047b1a 100644 --- a/pgloader/tools.py +++ b/pgloader/tools.py @@ -116,3 +116,34 @@ def parse_config_string(str): +def read_path(strpath, verbose = False, path = None, check = True): + """ read a ':' separated path string, dropping bad entries if check is set """ + if path is None: + path = [] + if strpath: + path.extend(strpath.split(':')) + + if check: + return check_path(path, verbose) + else: + return path + +def check_path(path, verbose = False): + """ removes non-existent and non {directories, symlink} entries from path + """ + import os.path + path_ok = [] + for p in path: + if os.path.exists(p): + if os.path.isdir(p) or \ + (os.path.islink(p) and os.path.isdir(os.path.realpath(p))): + path_ok.append(p) + else: + if verbose: + print "Warning: path entry '%s' " % p + \ + "is not a directory or does not link to a directory" + else: + if verbose: + print "Warning: path entry '%s' does not exists, ignored" % p + + return path_ok diff --git a/reformat/mysql.py b/reformat/mysql.py index e44ae99..78aa67e 100644 --- 
a/reformat/mysql.py +++ 
b/reformat/mysql.py @@ -2,7 +2,6 @@ # # pgloader mysql reformating module # -from pgloader.tools import PGLoader_Error def timestamp(reject, input): """ Reformat str as a PostgreSQL timestamp @@ -12,7 +11,7 @@ def timestamp(reject, input): """ if len(input) != 14: e = "MySQL timestamp reformat input too short: %s" % input - raise PGLoader_Error, e + reject.log(e, input) year = input[0:4] month = input[4:6] @@ -21,4 +20,4 @@ def timestamp(reject, input): minute = input[10:12] seconds = input[12:14] - return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, month, seconds) + return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, minute, seconds)