mirror of
https://github.com/dimitri/pgloader.git
synced 2025-08-07 23:07:00 +02:00
DEBUG and documentation of reformat option, plus other fixes for having all
the test cases pass again.
This commit is contained in:
parent
9b9ef1cc05
commit
86b33defef
@ -48,25 +48,39 @@ The provided examples are:
|
||||
In this dataset the id field is omitted; it's a serial which will be
|
||||
automatically set by PostgreSQL while COPYing.
|
||||
|
||||
. reformat
|
||||
|
||||
A timestamp column is formatted the way MySQL dumps its timestamps,
|
||||
which is not the same as the way PostgreSQL reads them. The
|
||||
reformat.mysql module is used to reformat the data on-the-fly.
|
||||
|
||||
. udc
|
||||
|
||||
A user defined column test, where not all file columns are used but
|
||||
a new constant one, not found in the input datafile, is added while
|
||||
loading data.
|
||||
|
||||
You can launch all those pgloader tests in one run, provided you created the
|
||||
necessary tables:
|
||||
|
||||
$ for sql in */*sql; do psql pgloader < $sql; done
|
||||
$ ../pgloader.py -Tc pgloader.conf
|
||||
$ ../pgloader.py -Tsc pgloader.conf
|
||||
|
||||
[...]
|
||||
|
||||
Table name | duration | size | updates | errors
|
||||
Table name | duration | size | copy rows | errors
|
||||
====================================================================
|
||||
clob | 0.041s | 32 kB | 7 | 0
|
||||
cluttered | 0.037s | 32 kB | 6 | 0
|
||||
csv | 0.019s | 16 kB | 6 | 0
|
||||
errors | 0.032s | 32 kB | 4 | 3
|
||||
partial | 0.024s | 32 kB | 7 | 0
|
||||
serial | 0.028s | 32 kB | 7 | 0
|
||||
simple | 0.029s | 32 kB | 7 | 0
|
||||
clob | 0.043s | 32 kB | 7 | 0
|
||||
cluttered | 0.032s | 32 kB | 6 | 0
|
||||
csv | 0.031s | 16 kB | 6 | 0
|
||||
errors | 0.030s | 32 kB | 4 | 3
|
||||
partial | 0.078s | 32 kB | 7 | 0
|
||||
reformat | 0.018s | 24 kB | 4 | 1
|
||||
serial | 0.024s | 32 kB | 7 | 0
|
||||
simple | 0.024s | 32 kB | 7 | 0
|
||||
udc | 0.018s | 32 kB | 5 | 0
|
||||
====================================================================
|
||||
Total | 0.210s | 208 kB | 44 | 3
|
||||
|
||||
Total | 0.298s | 264 kB | 53 | 4
|
||||
|
||||
|
||||
Please note that the errors test should return 3 errors and the reformat test 1 error.
|
||||
|
@ -14,6 +14,8 @@ commit_every = 5
|
||||
null = ""
|
||||
empty_string = "\ "
|
||||
|
||||
reformat_path = /usr/share/pgloader/reformat
|
||||
|
||||
[simple]
|
||||
table = simple
|
||||
format = text
|
||||
@ -79,6 +81,14 @@ columns = b:2, d:1, x:3, y:4
|
||||
udc_c = constant value
|
||||
copy_columns = b, c, d
|
||||
|
||||
[reformat]
|
||||
table = reformat
|
||||
format = text
|
||||
filename = reformat/reformat.data
|
||||
field_sep = |
|
||||
columns = id, timestamp
|
||||
reformat = timestamp:mysql:timestamp
|
||||
|
||||
[csv]
|
||||
table = csv
|
||||
format = csv
|
||||
|
@ -229,16 +229,31 @@ null::
|
||||
You can configure here how the null value is represented in your flat
|
||||
data file.
|
||||
+
|
||||
This parameter is optionnal and defaults to '' (that is +empty string+).
|
||||
This parameter is optional and defaults to +''+ (that is +empty string+).
|
||||
|
||||
empty_string::
|
||||
+
|
||||
You can configure here how empty values are represented in your flat
|
||||
data file.
|
||||
+
|
||||
This parameter is optionnal and defaults to '\ ' (that is backslash
|
||||
followed by space).
|
||||
This parameter is optional and defaults to +$$'\ '$$+ (that is
|
||||
backslash followed by space).
|
||||
|
||||
reformat_path::
|
||||
+
|
||||
When using the +reformat+ option, provide here a colon-separated path list
|
||||
where to look for reformatting modules.
|
||||
+
|
||||
reformat_path = .:/home/dim/PostgreSQL/pgfoundry/pgloader/reformat
|
||||
+
|
||||
The directories given here should exist and contain a
|
||||
+$$__init__.py$$+ file (for python to consider them as packages), the
|
||||
only modules and functions used in the package will be the ones you
|
||||
configure with the section-specific +reformat+ option.
|
||||
+
|
||||
Default value is +/usr/share/pgloader/reformat+, which is where the
|
||||
provided +debian+ package of +pgloader+ installs the +reformat+
|
||||
modules.
|
||||
|
||||
== COMMON FORMAT CONFIGURATION PARAMETERS ==
|
||||
|
||||
@ -288,7 +303,7 @@ be used by the generated +COPY+ commands, thus +pgloader+ does not
|
||||
have to deal with escaping the delimiter it uses (input data has to
|
||||
have escaped it).
|
||||
+
|
||||
This parameter is optionnal and defaults to pipe char '|'.
|
||||
This parameter is optional and defaults to the pipe char +$$'|'$$+.
|
||||
|
||||
client_encoding::
|
||||
+
|
||||
@ -427,6 +442,33 @@ Here's an example:
|
||||
+
|
||||
blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
|
||||
|
||||
reformat::
|
||||
+
|
||||
Use this option when you need to preprocess some column data with
|
||||
+pgloader+ reformatting modules, or your own. The value of this option is
|
||||
a comma separated list of columns to rewrite, each given as a colon
|
||||
separated list of column name, reformat module name, reformat function
|
||||
name. Here's an example to reformat column +dt_cx+ with the
|
||||
+mysql.timestamp()+ reformatting function:
|
||||
+
|
||||
reformat = dt_cx:mysql:timestamp
|
||||
+
|
||||
See global setting option +reformat_path+ for configuring where
|
||||
+pgloader+ will look for reformat packages and modules.
|
||||
+
|
||||
If you want to write a new reformatting function, provide a Python
|
||||
package called +reformat+ (a directory of this name containing an
|
||||
empty +$$__init__.py$$+ file will do) and place in there arbitrarily named
|
||||
modules (+foo.py+ files) containing functions with the following
|
||||
signature:
|
||||
+
|
||||
def bar(reject, input)
|
||||
+
|
||||
The reject object has a +log(self, messages, data = None)+ method for
|
||||
you to log errors into +section.rej.log+ and +section.rej+ files.
|
||||
|
||||
|
||||
|
||||
== TEXT FORMAT CONFIGURATION PARAMETERS ==
|
||||
|
||||
field_count::
|
||||
|
15
pgloader.py
15
pgloader.py
@ -226,6 +226,21 @@ def parse_config(conffile):
|
||||
pgloader.options.EMPTY_STRING = pgloader.tools.parse_config_string(
|
||||
config.get(section, 'empty_string'))
|
||||
|
||||
if config.has_option(section, 'reformat_path'):
|
||||
import os.path
|
||||
reformat_path = []
|
||||
tmp_rpath = config.get(section, 'reformat_path')
|
||||
|
||||
for p in tmp_rpath.split(':'):
|
||||
if os.path.exists(p):
|
||||
reformat_path.append(p)
|
||||
else:
|
||||
print 'Error: reformat_path %s does not exists, ignored'%p
|
||||
|
||||
pgloader.options.REFORMAT_PATH = reformat_path
|
||||
else:
|
||||
pgloader.reformat_path = None
|
||||
|
||||
except Exception, error:
|
||||
print "Error: Could not initialize PostgreSQL connection:"
|
||||
print error
|
||||
|
@ -31,3 +31,5 @@ FROM_COUNT = None
|
||||
FROM_ID = None
|
||||
|
||||
UDC_PREFIX = 'udc_'
|
||||
|
||||
REFORMAT_PATH = ['/usr/share/pgloader/reformat']
|
||||
|
@ -19,6 +19,7 @@ from options import INPUT_ENCODING, PG_CLIENT_ENCODING
|
||||
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
|
||||
from options import NEWLINE_ESCAPES
|
||||
from options import UDC_PREFIX
|
||||
from options import REFORMAT_PATH
|
||||
|
||||
class PGLoader:
|
||||
"""
|
||||
@ -326,10 +327,10 @@ class PGLoader:
|
||||
self._parse_fields('c_reformat', config.get(name, 'reformat'),
|
||||
btype = True, argtype = 'string')
|
||||
else:
|
||||
self.reformat = None
|
||||
self.c_reformat = self.reformat = None
|
||||
|
||||
if DEBUG:
|
||||
print 'reformat:', self.c_reformat
|
||||
print 'reformat', self.c_reformat
|
||||
|
||||
# check the configure reformating is available
|
||||
if self.c_reformat:
|
||||
@ -346,17 +347,18 @@ class PGLoader:
|
||||
module = None
|
||||
try:
|
||||
fp, pathname, description = \
|
||||
imp.find_module(r_module,
|
||||
['reformat',
|
||||
# explicit debian packaging support
|
||||
'/usr/share/pgloader/reformat'])
|
||||
imp.find_module(r_module, REFORMAT_PATH)
|
||||
|
||||
if DEBUG:
|
||||
print 'Found %s at %s' % (r_module, pathname)
|
||||
|
||||
module = imp.load_module(r_module,
|
||||
fp, pathname, description)
|
||||
|
||||
except ImportError, e:
|
||||
print 'Error: %s failed to import reformat module %s' \
|
||||
print 'Error: %s failed to import reformat module "%s"' \
|
||||
% (name, r_module)
|
||||
print ' from %s' % str(REFORMAT_PATH)
|
||||
self.config_errors += 1
|
||||
|
||||
if module:
|
||||
@ -582,6 +584,8 @@ class PGLoader:
|
||||
print 'columns', columns
|
||||
print 'data ', data
|
||||
|
||||
columns = data
|
||||
|
||||
else:
|
||||
if self.col_mapping:
|
||||
if DEBUG:
|
||||
@ -593,15 +597,15 @@ class PGLoader:
|
||||
print 'columns', columns
|
||||
print 'data ', data
|
||||
|
||||
columns = data
|
||||
|
||||
if self.only_cols:
|
||||
# only consider data matched by self.only_cols
|
||||
if self.col_mapping:
|
||||
data = [columns[self.col_mapping[i-1]-1]
|
||||
for i in self.only_cols]
|
||||
else:
|
||||
data = [columns[i-1] for i in self.only_cols]
|
||||
|
||||
if not self.reformat and not self.udcs and not self.col_mapping:
|
||||
if not self.reformat \
|
||||
and not self.udcs \
|
||||
and not self.col_mapping \
|
||||
and not self.only_cols:
|
||||
data = columns
|
||||
|
||||
if DRY_RUN or DEBUG:
|
||||
|
Loading…
Reference in New Issue
Block a user