pgloader: add per-column "reformat" support

* Makefile: install the reformat/*.py modules under $(libdir)/reformat,
  creating that directory first.
* pgloader/pgloader.py: read the optional "reformat" setting, import each
  named module from ./reformat or /usr/share/pgloader/reformat, and apply
  the selected function to the matching column in data_import() before the
  user-defined-column step.
* reformat/mysql.py: new module; timestamp() turns a 14-character MySQL
  timestamp (20041002152952) into "2004-10-02 15:29:52".

diff --git a/Makefile b/Makefile
index fbc8918..b6b6d36 100644
--- a/Makefile
+++ b/Makefile
@@ -9,12 +9,15 @@ exdir = $(DESTDIR)/usr/share/doc/pgloader
 
 pgloader = pgloader.py
 examples = examples
 libs = $(wildcard pgloader/*.py)
+refm = $(wildcard reformat/*.py)
 
 install:
 	install -m 755 $(pgloader) $(libdir)
 	install -m 755 -d $(libdir)/pgloader
 	cp -a $(libs) $(libdir)/pgloader
+	install -m 755 -d $(libdir)/reformat
+	cp -a $(refm) $(libdir)/reformat
 	cp -a $(examples) $(exdir)
 
 html: $(DOCS)
diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py
index dfbb86a..06511dc 100644
--- a/pgloader/pgloader.py
+++ b/pgloader/pgloader.py
@@ -168,9 +168,16 @@ class PGLoader:
                        [n for (n, v) in self.udcs]
 
             copy_columns = config.get(name, 'copy_columns').split(',')
-            self.copy_columns = [x.strip()
-                                 for x in copy_columns
-                                 if x.strip() in namelist]
+
+            self.copy_columns = []
+            for x in copy_columns:
+                x = x.strip(' \n\r')
+                if x not in namelist:
+                    print 'Error: "%s" not in %s column list, ' \
+                          % (x, name) +\
+                          'including user defined columns'
+                else:
+                    self.copy_columns.append(x)
 
             if len(self.copy_columns) != len(copy_columns):
                 print 'Error: %s.copy_columns refers to ' % name +\
@@ -313,6 +320,57 @@ class PGLoader:
             print 'Error: %s: format parameter needed' % name
             raise PGLoader_Error
 
+        ##
+        # Some columns might need reformatting
+        if config.has_option(name, 'reformat'):
+            self._parse_fields('c_reformat', config.get(name, 'reformat'),
+                               btype = True, argtype = 'string')
+        else:
+            self.c_reformat = self.reformat = None
+
+        if DEBUG:
+            print 'reformat:', self.c_reformat
+
+        # check the configured reformatting is available
+        if self.c_reformat:
+            import imp
+            self.reformat = []
+
+            for r_colname, r_module, r_function in self.c_reformat:
+                if r_colname not in self.columnlist:
+                    print 'Error: %s.reformat refers to unknown column %s' \
+                          % ( name, r_colname )
+                    self.config_errors += 1
+
+                # load the given module name and function
+                module = None
+                try:
+                    fp, pathname, description = \
+                        imp.find_module(r_module,
+                                        ['reformat',
+                                         # explicit debian packaging support
+                                         '/usr/share/pgloader/reformat'])
+
+                    module = imp.load_module(r_module,
+                                             fp, pathname, description)
+
+                except ImportError, e:
+                    print 'Error: %s failed to import reformat module %s' \
+                          % (name, r_module)
+                    self.config_errors += 1
+
+                if module:
+                    if r_function in module.__dict__:
+                        self.reformat.append((r_colname,
+                                              module.__dict__[r_function]))
+                    else:
+                        print 'Error: reformat module %s has no %s function'%\
+                              (r_module, r_function)
+                        self.config_errors += 1
+
+        if DEBUG:
+            print 'reformat', self.reformat
+
         ##
         # parse the reader specific section options
         self.reader.readconfig(name, config)
@@ -382,13 +440,17 @@ class PGLoader:
                 # arg is the target column index
                 try:
                     arg = int(arg)
-                except ValueError:
-                    raise PGLoader_Error
+                except ValueError, e:
+                    raise PGLoader_Error, e
 
             elif argtype == 'char':
                 # arg is an escape char
                 if len(arg) > 1:
-                    raise PGLoader_Error
+                    raise PGLoader_Error, 'more than one character for char'
+
+            elif argtype == 'string':
+                # accept all inputs
+                pass
 
             return arg
 
@@ -474,15 +536,39 @@ class PGLoader:
 
     def data_import(self):
         """ import CSV or TEXT data, using COPY """
+
+        # some more practical data format of internals
+        ddict = dict(self.columns)
+        if self.reformat:
+            drefc = dict(self.reformat)
+
+        if self.udcs:
+            dudcs = dict(self.udcs)
+
         for line, columns in self.reader.readlines():
             if self.blob_cols is not None:
                 columns, rowids = self.read_blob(line, columns)
 
-            data = columns
+            if self.reformat:
+                data = []
+                for cname, cpos in self.columns:
+                    if cname in drefc:
+                        # reformat the column value
+                        data.append(drefc[cname](self.reject,
+                                                 columns[cpos-1]))
+                    else:
+                        data.append(columns[cpos-1])
+                if DEBUG:
+                    print 'reformat'
+                    print 'columns', columns
+                    print 'data ', data
+
+                # we want next steps to take reformatted data as input
+                columns = data
+
 
             if self.udcs:
                 dudcs = dict(self.udcs)
-                ddict = dict(self.columns)
                 data = []
                 for c in self.copy_columns:
                     if c in ddict:
@@ -491,6 +577,7 @@ class PGLoader:
                         data.append(dudcs[c])
 
                 if DEBUG:
+                    print 'udcs'
                     print 'columns', columns
                     print 'data ', data
 
@@ -513,9 +600,13 @@ class PGLoader:
                 else:
                     data = [columns[i-1] for i in self.only_cols]
 
+            if not self.reformat and not self.udcs and not self.col_mapping:
+                data = columns
+
             if DRY_RUN or DEBUG:
-                print line
-                print self.columnlist, data
+                print '<', line
+                print ' ', self.columnlist
+                print '>', data
                 print
 
             if not DRY_RUN:
diff --git a/reformat/mysql.py b/reformat/mysql.py
new file mode 100644
index 0000000..e44ae99
--- /dev/null
+++ b/reformat/mysql.py
@@ -0,0 +1,24 @@
+# Author: Dimitri Fontaine
+#
+# pgloader mysql reformatting module
+#
+from pgloader.tools import PGLoader_Error
+
+def timestamp(reject, input):
+    """ Reformat str as a PostgreSQL timestamp
+
+    MySQL timestamps are like:   20041002152952
+    We want instead this input:  2004-10-02 15:29:52
+    """
+    if len(input) != 14:
+        e = "MySQL timestamp reformat input too short: %s" % input
+        raise PGLoader_Error, e
+
+    year    = input[0:4]
+    month   = input[4:6]
+    day     = input[6:8]
+    hour    = input[8:10]
+    minute  = input[10:12]
+    seconds = input[12:14]
+
+    return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, minute, seconds)