mirror of
https://github.com/dimitri/pgloader.git
synced 2025-08-07 23:07:00 +02:00
DEBUG and documentation of reformat option, plus other fixes for having all
the test cases pass again.
This commit is contained in:
parent
9b9ef1cc05
commit
86b33defef
@ -48,25 +48,39 @@ The provided examples are:
|
|||||||
In this dataset the id field is omitted, it's a serial which will be
|
In this dataset the id field is omitted, it's a serial which will be
|
||||||
automatically set by PostgreSQL while COPYing.
|
automatically set by PostgreSQL while COPYing.
|
||||||
|
|
||||||
|
. reformat
|
||||||
|
|
||||||
|
A timestamp column is formatted the way MySQL dumps its timestamps,
|
||||||
|
which is not the same as the way PostgreSQL reads them. The
|
||||||
|
reformat.mysql module is used to reformat the data on-the-fly.
|
||||||
|
|
||||||
|
. udc
|
||||||
|
|
||||||
|
A user defined column test, where not all file columns are used but
|
||||||
|
a new constant one, not found in the input datafile, is added while
|
||||||
|
loading data.
|
||||||
|
|
||||||
You can launch all those pgloader tests in one run, provided you created the
|
You can launch all those pgloader tests in one run, provided you created the
|
||||||
necessary tables:
|
necessary tables:
|
||||||
|
|
||||||
$ for sql in */*sql; do psql pgloader < $sql; done
|
$ for sql in */*sql; do psql pgloader < $sql; done
|
||||||
$ ../pgloader.py -Tc pgloader.conf
|
$ ../pgloader.py -Tsc pgloader.conf
|
||||||
|
|
||||||
[...]
|
[...]
|
||||||
|
|
||||||
Table name | duration | size | updates | errors
|
Table name | duration | size | copy rows | errors
|
||||||
====================================================================
|
====================================================================
|
||||||
clob | 0.041s | 32 kB | 7 | 0
|
clob | 0.043s | 32 kB | 7 | 0
|
||||||
cluttered | 0.037s | 32 kB | 6 | 0
|
cluttered | 0.032s | 32 kB | 6 | 0
|
||||||
csv | 0.019s | 16 kB | 6 | 0
|
csv | 0.031s | 16 kB | 6 | 0
|
||||||
errors | 0.032s | 32 kB | 4 | 3
|
errors | 0.030s | 32 kB | 4 | 3
|
||||||
partial | 0.024s | 32 kB | 7 | 0
|
partial | 0.078s | 32 kB | 7 | 0
|
||||||
serial | 0.028s | 32 kB | 7 | 0
|
reformat | 0.018s | 24 kB | 4 | 1
|
||||||
simple | 0.029s | 32 kB | 7 | 0
|
serial | 0.024s | 32 kB | 7 | 0
|
||||||
|
simple | 0.024s | 32 kB | 7 | 0
|
||||||
|
udc | 0.018s | 32 kB | 5 | 0
|
||||||
====================================================================
|
====================================================================
|
||||||
Total | 0.210s | 208 kB | 44 | 3
|
Total | 0.298s | 264 kB | 53 | 4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Please note that the errors test should return 3 errors and the reformat test 1 error.
|
||||||
|
@ -14,6 +14,8 @@ commit_every = 5
|
|||||||
null = ""
|
null = ""
|
||||||
empty_string = "\ "
|
empty_string = "\ "
|
||||||
|
|
||||||
|
reformat_path = /usr/share/pgloader/reformat
|
||||||
|
|
||||||
[simple]
|
[simple]
|
||||||
table = simple
|
table = simple
|
||||||
format = text
|
format = text
|
||||||
@ -79,6 +81,14 @@ columns = b:2, d:1, x:3, y:4
|
|||||||
udc_c = constant value
|
udc_c = constant value
|
||||||
copy_columns = b, c, d
|
copy_columns = b, c, d
|
||||||
|
|
||||||
|
[reformat]
|
||||||
|
table = reformat
|
||||||
|
format = text
|
||||||
|
filename = reformat/reformat.data
|
||||||
|
field_sep = |
|
||||||
|
columns = id, timestamp
|
||||||
|
reformat = timestamp:mysql:timestamp
|
||||||
|
|
||||||
[csv]
|
[csv]
|
||||||
table = csv
|
table = csv
|
||||||
format = csv
|
format = csv
|
||||||
|
@ -229,16 +229,31 @@ null::
|
|||||||
You can configure here how null value is represented into your flat
|
You can configure here how null value is represented into your flat
|
||||||
data file.
|
data file.
|
||||||
+
|
+
|
||||||
This parameter is optional and defaults to '' (that is +empty string+).
|
This parameter is optional and defaults to +''+ (that is +empty string+).
|
||||||
|
|
||||||
empty_string::
|
empty_string::
|
||||||
+
|
+
|
||||||
You can configure here how empty values are represented into your flat
|
You can configure here how empty values are represented into your flat
|
||||||
data file.
|
data file.
|
||||||
+
|
+
|
||||||
This parameter is optional and defaults to '\ ' (that is backslash
|
This parameter is optional and defaults to +$$'\ '$$+ (that is
|
||||||
followed by space).
|
backslash followed by space).
|
||||||
|
|
||||||
|
reformat_path::
|
||||||
|
+
|
||||||
|
When using the +reformat+ option, provide here a colon separated list of
|
||||||
|
paths where to look for reformatting modules.
|
||||||
|
+
|
||||||
|
reformat_path = .:/home/dim/PostgreSQL/pgfoundry/pgloader/reformat
|
||||||
|
+
|
||||||
|
The directories given here should exist and contain a
|
||||||
|
+$$__init__.py$$+ file (for python to consider them as packages), the
|
||||||
|
only modules and functions used in the package will be the ones you
|
||||||
|
configure with +reformat+ section specific option.
|
||||||
|
+
|
||||||
|
Default value is +/usr/share/pgloader/reformat+, which is where the
|
||||||
|
provided +debian+ package of +pgloader+ installs the +reformat+
|
||||||
|
modules.
|
||||||
|
|
||||||
== COMMON FORMAT CONFIGURATION PARAMETERS ==
|
== COMMON FORMAT CONFIGURATION PARAMETERS ==
|
||||||
|
|
||||||
@ -288,7 +303,7 @@ be used by the generated +COPY+ commands, thus +pgloader+ does not
|
|||||||
have to deal with escaping the delimiter it uses (input data has to
|
have to deal with escaping the delimiter it uses (input data has to
|
||||||
have escaped it).
|
have escaped it).
|
||||||
+
|
+
|
||||||
This parameter is optional and defaults to pipe char '|'.
|
This parameter is optional and defaults to pipe char +$$'|'$$+.
|
||||||
|
|
||||||
client_encoding::
|
client_encoding::
|
||||||
+
|
+
|
||||||
@ -427,6 +442,33 @@ Here's an example:
|
|||||||
+
|
+
|
||||||
blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
|
blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
|
||||||
|
|
||||||
|
reformat::
|
||||||
|
+
|
||||||
|
Use this option when you need to preprocess some column data with
|
||||||
|
+pgloader+ reformatting modules, or your own. The value of this option is
|
||||||
|
a comma separated list of columns to rewrite, which are a colon
|
||||||
|
separated list of column name, reformat module name, reformat function
|
||||||
|
name. Here's an example to reformat column +dt_cx+ with the
|
||||||
|
+mysql.timestamp()+ reformatting function:
|
||||||
|
+
|
||||||
|
reformat = dt_cx:mysql:timestamp
|
||||||
|
+
|
||||||
|
See global setting option +reformat_path+ for configuring where
|
||||||
|
+pgloader+ will look for reformat packages and modules.
|
||||||
|
+
|
||||||
|
If you want to write a new reformatting function, provide a python
|
||||||
|
package called +reformat+ (a directory of this name containing an
|
||||||
|
empty +$$__init__.py$$+ file will do) and place in there arbitrary named
|
||||||
|
modules (+foo.py+ files) containing functions with the following
|
||||||
|
signature:
|
||||||
|
+
|
||||||
|
def bar(reject, input)
|
||||||
|
+
|
||||||
|
The reject object has a +log(self, messages, data = None)+ method for
|
||||||
|
you to log errors into +section.rej.log+ and +section.rej+ files.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
== TEXT FORMAT CONFIGURATION PARAMETERS ==
|
== TEXT FORMAT CONFIGURATION PARAMETERS ==
|
||||||
|
|
||||||
field_count::
|
field_count::
|
||||||
|
15
pgloader.py
15
pgloader.py
@ -226,6 +226,21 @@ def parse_config(conffile):
|
|||||||
pgloader.options.EMPTY_STRING = pgloader.tools.parse_config_string(
|
pgloader.options.EMPTY_STRING = pgloader.tools.parse_config_string(
|
||||||
config.get(section, 'empty_string'))
|
config.get(section, 'empty_string'))
|
||||||
|
|
||||||
|
if config.has_option(section, 'reformat_path'):
|
||||||
|
import os.path
|
||||||
|
reformat_path = []
|
||||||
|
tmp_rpath = config.get(section, 'reformat_path')
|
||||||
|
|
||||||
|
for p in tmp_rpath.split(':'):
|
||||||
|
if os.path.exists(p):
|
||||||
|
reformat_path.append(p)
|
||||||
|
else:
|
||||||
|
print 'Error: reformat_path %s does not exists, ignored'%p
|
||||||
|
|
||||||
|
pgloader.options.REFORMAT_PATH = reformat_path
|
||||||
|
else:
|
||||||
|
pgloader.reformat_path = None
|
||||||
|
|
||||||
except Exception, error:
|
except Exception, error:
|
||||||
print "Error: Could not initialize PostgreSQL connection:"
|
print "Error: Could not initialize PostgreSQL connection:"
|
||||||
print error
|
print error
|
||||||
|
@ -31,3 +31,5 @@ FROM_COUNT = None
|
|||||||
FROM_ID = None
|
FROM_ID = None
|
||||||
|
|
||||||
UDC_PREFIX = 'udc_'
|
UDC_PREFIX = 'udc_'
|
||||||
|
|
||||||
|
REFORMAT_PATH = ['/usr/share/pgloader/reformat']
|
||||||
|
@ -19,6 +19,7 @@ from options import INPUT_ENCODING, PG_CLIENT_ENCODING
|
|||||||
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
|
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
|
||||||
from options import NEWLINE_ESCAPES
|
from options import NEWLINE_ESCAPES
|
||||||
from options import UDC_PREFIX
|
from options import UDC_PREFIX
|
||||||
|
from options import REFORMAT_PATH
|
||||||
|
|
||||||
class PGLoader:
|
class PGLoader:
|
||||||
"""
|
"""
|
||||||
@ -326,10 +327,10 @@ class PGLoader:
|
|||||||
self._parse_fields('c_reformat', config.get(name, 'reformat'),
|
self._parse_fields('c_reformat', config.get(name, 'reformat'),
|
||||||
btype = True, argtype = 'string')
|
btype = True, argtype = 'string')
|
||||||
else:
|
else:
|
||||||
self.reformat = None
|
self.c_reformat = self.reformat = None
|
||||||
|
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
print 'reformat:', self.c_reformat
|
print 'reformat', self.c_reformat
|
||||||
|
|
||||||
# check the configure reformating is available
|
# check the configure reformating is available
|
||||||
if self.c_reformat:
|
if self.c_reformat:
|
||||||
@ -346,17 +347,18 @@ class PGLoader:
|
|||||||
module = None
|
module = None
|
||||||
try:
|
try:
|
||||||
fp, pathname, description = \
|
fp, pathname, description = \
|
||||||
imp.find_module(r_module,
|
imp.find_module(r_module, REFORMAT_PATH)
|
||||||
['reformat',
|
|
||||||
# explicit debian packaging support
|
if DEBUG:
|
||||||
'/usr/share/pgloader/reformat'])
|
print 'Found %s at %s' % (r_module, pathname)
|
||||||
|
|
||||||
module = imp.load_module(r_module,
|
module = imp.load_module(r_module,
|
||||||
fp, pathname, description)
|
fp, pathname, description)
|
||||||
|
|
||||||
except ImportError, e:
|
except ImportError, e:
|
||||||
print 'Error: %s failed to import reformat module %s' \
|
print 'Error: %s failed to import reformat module "%s"' \
|
||||||
% (name, r_module)
|
% (name, r_module)
|
||||||
|
print ' from %s' % str(REFORMAT_PATH)
|
||||||
self.config_errors += 1
|
self.config_errors += 1
|
||||||
|
|
||||||
if module:
|
if module:
|
||||||
@ -582,6 +584,8 @@ class PGLoader:
|
|||||||
print 'columns', columns
|
print 'columns', columns
|
||||||
print 'data ', data
|
print 'data ', data
|
||||||
|
|
||||||
|
columns = data
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if self.col_mapping:
|
if self.col_mapping:
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
@ -593,15 +597,15 @@ class PGLoader:
|
|||||||
print 'columns', columns
|
print 'columns', columns
|
||||||
print 'data ', data
|
print 'data ', data
|
||||||
|
|
||||||
if self.only_cols:
|
columns = data
|
||||||
# only consider data matched by self.only_cols
|
|
||||||
if self.col_mapping:
|
|
||||||
data = [columns[self.col_mapping[i-1]-1]
|
|
||||||
for i in self.only_cols]
|
|
||||||
else:
|
|
||||||
data = [columns[i-1] for i in self.only_cols]
|
|
||||||
|
|
||||||
if not self.reformat and not self.udcs and not self.col_mapping:
|
if self.only_cols:
|
||||||
|
data = [columns[i-1] for i in self.only_cols]
|
||||||
|
|
||||||
|
if not self.reformat \
|
||||||
|
and not self.udcs \
|
||||||
|
and not self.col_mapping \
|
||||||
|
and not self.only_cols:
|
||||||
data = columns
|
data = columns
|
||||||
|
|
||||||
if DRY_RUN or DEBUG:
|
if DRY_RUN or DEBUG:
|
||||||
|
Loading…
Reference in New Issue
Block a user