diff --git a/examples/README b/examples/README index e1fb48e..0010d73 100644 --- a/examples/README +++ b/examples/README @@ -48,25 +48,39 @@ The provided examples are: In this dataset the id field is ommited, it's a serial which will be automatically set by PostgreSQL while COPYing. +. reformat + + A timestamp column is formatted the way MySQL dumps its timestamps, + which is not the same as the way PostgreSQL reads them. The + reformat.mysql module is used to reformat the data on-the-fly. + +. udc + + A user defined column test, where not all file columns are used but + a new constant one, not found in the input datafile, is added while + loading data. + You can launch all those pgloader tests in one run, provided you created the necessary tables: $ for sql in */*sql; do psql pgloader < $sql; done - $ ../pgloader.py -Tc pgloader.conf + $ ../pgloader.py -Tsc pgloader.conf [...] - - Table name | duration | size | updates | errors + + Table name | duration | size | copy rows | errors ==================================================================== - clob | 0.041s | 32 kB | 7 | 0 - cluttered | 0.037s | 32 kB | 6 | 0 - csv | 0.019s | 16 kB | 6 | 0 - errors | 0.032s | 32 kB | 4 | 3 - partial | 0.024s | 32 kB | 7 | 0 - serial | 0.028s | 32 kB | 7 | 0 - simple | 0.029s | 32 kB | 7 | 0 + clob | 0.043s | 32 kB | 7 | 0 + cluttered | 0.032s | 32 kB | 6 | 0 + csv | 0.031s | 16 kB | 6 | 0 + errors | 0.030s | 32 kB | 4 | 3 + partial | 0.078s | 32 kB | 7 | 0 + reformat | 0.018s | 24 kB | 4 | 1 + serial | 0.024s | 32 kB | 7 | 0 + simple | 0.024s | 32 kB | 7 | 0 + udc | 0.018s | 32 kB | 5 | 0 ==================================================================== - Total | 0.210s | 208 kB | 44 | 3 - + Total | 0.298s | 264 kB | 53 | 4 +Please note the errors test should return 3 errors and the reformat test 1 error.
diff --git a/examples/pgloader.conf b/examples/pgloader.conf index 91da5cf..130680f 100644 --- a/examples/pgloader.conf +++ b/examples/pgloader.conf @@ -14,6 +14,8 @@ commit_every = 5 null = "" empty_string = "\ " +reformat_path = /usr/share/pgloader/reformat + [simple] table = simple format = text @@ -79,6 +81,14 @@ columns = b:2, d:1, x:3, y:4 udc_c = constant value copy_columns = b, c, d +[reformat] +table = reformat +format = text +filename = reformat/reformat.data +field_sep = | +columns = id, timestamp +reformat = timestamp:mysql:timestamp + [csv] table = csv format = csv diff --git a/pgloader.1.txt b/pgloader.1.txt index 83d9285..8e5e3bb 100644 --- a/pgloader.1.txt +++ b/pgloader.1.txt @@ -229,16 +229,31 @@ null:: You can configure here how null value is represented into your flat data file. + -This parameter is optionnal and defaults to '' (that is +empty string+). +This parameter is optional and defaults to +''+ (that is +empty string+). empty_string:: + You can configure here how empty values are represented into your flat data file. + -This parameter is optionnal and defaults to '\ ' (that is backslash -followed by space). +This parameter is optional and defaults to +$$'\ '$$+ (that is +backslash followed by space). +reformat_path:: ++ +When using the +reformat+ option, provide here a colon separated path list +where to look for reformatting modules. ++ + reformat_path = .:/home/dim/PostgreSQL/pgfoundry/pgloader/reformat ++ +The directories given here should exist and contain a ++$$__init__.py$$+ file (for python to consider them as packages); the +only modules and functions used in the package will be the ones you +configure with the +reformat+ section specific option. ++ +The default value is +/usr/share/pgloader/reformat+, which is where the +provided +debian+ package of +pgloader+ installs the +reformat+ +modules.
== COMMON FORMAT CONFIGURATION PARAMETERS == @@ -288,7 +303,7 @@ be used by the generated +COPY+ commands, thus +pgloader+ does not have to deal with escaping the delimiter it uses (input data has to have escaped it). + -This parameter is optionnal and defaults to pipe char '|'. +This parameter is optional and defaults to pipe char +$$'|'$$+. client_encoding:: + @@ -427,6 +442,33 @@ Here's an example: + blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob +reformat:: ++ +Use this option when you need to preprocess some column data with ++pgloader+ reformatting modules, or your own. The value of this option is +a comma separated list of columns to rewrite, each of which is a colon +separated list of column name, reformat module name, reformat function +name. Here's an example to reformat column +dt_cx+ with the ++mysql.timestamp()+ reformatting function: ++ + reformat = dt_cx:mysql:timestamp ++ +See global setting option +reformat_path+ for configuring where ++pgloader+ will look for reformat packages and modules. ++ +If you want to write a new reformatting function, provide a python +package called +reformat+ (a directory of this name containing an +empty +$$__init__.py$$+ file will do) and place in there arbitrarily named +modules (+foo.py+ files) containing functions with the following +signature: ++ + def bar(reject, input) ++ +The reject object has a +log(self, messages, data = None)+ method for +you to log errors into +section.rej.log+ and +section.rej+ files.
+ + + == TEXT FORMAT CONFIGURATION PARAMETERS == field_count:: diff --git a/pgloader.py b/pgloader.py index ecdb0c3..fda4135 100644 --- a/pgloader.py +++ b/pgloader.py @@ -226,6 +226,21 @@ def parse_config(conffile): pgloader.options.EMPTY_STRING = pgloader.tools.parse_config_string( config.get(section, 'empty_string')) + if config.has_option(section, 'reformat_path'): + import os.path + reformat_path = [] + tmp_rpath = config.get(section, 'reformat_path') + + for p in tmp_rpath.split(':'): + if os.path.exists(p): + reformat_path.append(p) + else: + print 'Error: reformat_path %s does not exists, ignored'%p + + pgloader.options.REFORMAT_PATH = reformat_path + else: + pgloader.reformat_path = None + except Exception, error: print "Error: Could not initialize PostgreSQL connection:" print error diff --git a/pgloader/options.py b/pgloader/options.py index c6ea4c8..a6d12e1 100644 --- a/pgloader/options.py +++ b/pgloader/options.py @@ -31,3 +31,5 @@ FROM_COUNT = None FROM_ID = None UDC_PREFIX = 'udc_' + +REFORMAT_PATH = ['/usr/share/pgloader/reformat'] diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py index 06511dc..40d96d7 100644 --- a/pgloader/pgloader.py +++ b/pgloader/pgloader.py @@ -19,6 +19,7 @@ from options import INPUT_ENCODING, PG_CLIENT_ENCODING from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING from options import NEWLINE_ESCAPES from options import UDC_PREFIX +from options import REFORMAT_PATH class PGLoader: """ @@ -326,10 +327,10 @@ class PGLoader: self._parse_fields('c_reformat', config.get(name, 'reformat'), btype = True, argtype = 'string') else: - self.reformat = None + self.c_reformat = self.reformat = None if DEBUG: - print 'reformat:', self.c_reformat + print 'reformat', self.c_reformat # check the configure reformating is available if self.c_reformat: @@ -346,17 +347,18 @@ class PGLoader: module = None try: fp, pathname, description = \ - imp.find_module(r_module, - ['reformat', - # explicit debian packaging support - 
'/usr/share/pgloader/reformat']) + imp.find_module(r_module, REFORMAT_PATH) + + if DEBUG: + print 'Found %s at %s' % (r_module, pathname) module = imp.load_module(r_module, fp, pathname, description) except ImportError, e: - print 'Error: %s failed to import reformat module %s' \ + print 'Error: %s failed to import reformat module "%s"' \ % (name, r_module) + print ' from %s' % str(REFORMAT_PATH) self.config_errors += 1 if module: @@ -582,6 +584,8 @@ class PGLoader: print 'columns', columns print 'data ', data + columns = data + else: if self.col_mapping: if DEBUG: @@ -593,15 +597,15 @@ class PGLoader: print 'columns', columns print 'data ', data - if self.only_cols: - # only consider data matched by self.only_cols - if self.col_mapping: - data = [columns[self.col_mapping[i-1]-1] - for i in self.only_cols] - else: - data = [columns[i-1] for i in self.only_cols] + columns = data - if not self.reformat and not self.udcs and not self.col_mapping: + if self.only_cols: + data = [columns[i-1] for i in self.only_cols] + + if not self.reformat \ + and not self.udcs \ + and not self.col_mapping \ + and not self.only_cols: data = columns if DRY_RUN or DEBUG: