pgloader can now reformat data on-the-fly, using modules in the reformat package,

dynamically loading them as they're found in the configuration.

Some User-Defined Columns bugfixes too.
This commit is contained in:
dim 2007-11-20 11:54:07 +00:00
parent e9b884cb4e
commit 9b9ef1cc05
3 changed files with 128 additions and 10 deletions

View File

@ -9,12 +9,14 @@ exdir = $(DESTDIR)/usr/share/doc/pgloader
pgloader = pgloader.py
examples = examples
libs = $(wildcard pgloader/*.py)
refm = $(wildcard reformat/*.py)
# Install the pgloader script, its library modules, the reformat modules
# and the examples.
#
# Both target sub-directories are created explicitly before copying into
# them: without an existing $(libdir)/reformat directory, `cp` would either
# fail (several source files) or create a plain file named "reformat"
# (single source file) instead of populating a directory.
install:
	install -m 755 $(pgloader) $(libdir)
	install -m 755 -d $(libdir)/pgloader
	cp -a $(libs) $(libdir)/pgloader
	install -m 755 -d $(libdir)/reformat
	cp -a $(refm) $(libdir)/reformat
	cp -a $(examples) $(exdir)
html: $(DOCS)

View File

@ -168,9 +168,16 @@ class PGLoader:
[n for (n, v) in self.udcs]
copy_columns = config.get(name, 'copy_columns').split(',')
self.copy_columns = [x.strip()
for x in copy_columns
if x.strip() in namelist]
self.copy_columns = []
for x in copy_columns:
x = x.strip(' \n\r')
if x not in namelist:
print 'Error: "%s" not in %s column list, ' \
% (x, name) +\
'including user defined columns'
else:
self.copy_columns.append(x)
if len(self.copy_columns) != len(copy_columns):
print 'Error: %s.copy_columns refers to ' % name +\
@ -313,6 +320,57 @@ class PGLoader:
print 'Error: %s: format parameter needed' % name
raise PGLoader_Error
##
# Some columns might need reformatting
if config.has_option(name, 'reformat'):
self._parse_fields('c_reformat', config.get(name, 'reformat'),
btype = True, argtype = 'string')
else:
self.reformat = None
if DEBUG:
print 'reformat:', self.c_reformat
# check that the configured reformatting is available
if self.c_reformat:
import imp
self.reformat = []
for r_colname, r_module, r_function in self.c_reformat:
if r_colname not in self.columnlist:
print 'Error: %s.reformat refers to unknown column %s' \
% ( name, r_colname )
self.config_errors += 1
# load the given module name and function
module = None
try:
fp, pathname, description = \
imp.find_module(r_module,
['reformat',
# explicit debian packaging support
'/usr/share/pgloader/reformat'])
module = imp.load_module(r_module,
fp, pathname, description)
except ImportError, e:
print 'Error: %s failed to import reformat module %s' \
% (name, r_module)
self.config_errors += 1
if module:
if r_function in module.__dict__:
self.reformat.append((r_colname,
module.__dict__[r_function]))
else:
print 'Error: reformat module %s has no %s function'%\
(r_module, r_function)
self.config_errors += 1
if DEBUG:
print 'reformat', self.reformat
##
# parse the reader specific section options
self.reader.readconfig(name, config)
@ -382,13 +440,17 @@ class PGLoader:
# arg is the target column index
try:
arg = int(arg)
except ValueError:
raise PGLoader_Error
except ValueError, e:
raise PGLoader_Error, e
elif argtype == 'char':
# arg is an escape char
if len(arg) > 1:
raise PGLoader_Error
raise PGLoader_Error, 'more than one character for char'
elif argtype == 'string':
# accept all inputs
pass
return arg
@ -474,15 +536,40 @@ class PGLoader:
def data_import(self):
""" import CSV or TEXT data, using COPY """
# some more practical data format of internals
ddict = dict(self.columns)
if self.reformat:
drefc = dict(self.reformat)
if self.udcs:
dudcs = dict(self.udcs)
for line, columns in self.reader.readlines():
if self.blob_cols is not None:
columns, rowids = self.read_blob(line, columns)
data = columns
if self.reformat:
refc = dict(self.reformat)
data = []
for cname, cpos in self.columns:
if cname in drefc:
# reformat the column value
data.append(drefc[cname](self.reject,
columns[cpos-1]))
else:
data.append(columns[cpos-1])
if DEBUG:
print 'reformat'
print 'columns', columns
print 'data ', data
# we want next steps to take reformatted data as input
columns = data
if self.udcs:
dudcs = dict(self.udcs)
ddict = dict(self.columns)
data = []
for c in self.copy_columns:
if c in ddict:
@ -491,6 +578,7 @@ class PGLoader:
data.append(dudcs[c])
if DEBUG:
print 'udcs'
print 'columns', columns
print 'data ', data
@ -513,9 +601,13 @@ class PGLoader:
else:
data = [columns[i-1] for i in self.only_cols]
if not self.reformat and not self.udcs and not self.col_mapping:
data = columns
if DRY_RUN or DEBUG:
print line
print self.columnlist, data
print '<', line
print ' ', self.columnlist
print '>', data
print
if not DRY_RUN:

24
reformat/mysql.py Normal file
View File

@ -0,0 +1,24 @@
# Author: Dimitri Fontaine <dim@tapoueh.org>
#
# pgloader mysql reformatting module
#
from pgloader.tools import PGLoader_Error
def timestamp(reject, input):
    """ Reformat a MySQL timestamp string as a PostgreSQL timestamp

    MySQL timestamps are like:   20041002152952
    We want instead this output: 2004-10-02 15:29:52

    reject is the first argument every reformat function is called with;
    it is not used by this function.
    input is the raw column value, expected to be exactly 14 characters
    laid out as YYYYMMDDHHMMSS.

    Returns the string 'YYYY-MM-DD HH:MM:SS'.
    Raises PGLoader_Error when input is not exactly 14 characters long.
    """
    if len(input) != 14:
        # the check rejects both shorter and longer inputs, so the
        # message must not claim the input is merely "too short"
        e = "MySQL timestamp reformat input has wrong length " \
            "(expected 14 characters): %s" % input
        # call form of raise: same behaviour as "raise cls, arg"
        raise PGLoader_Error(e)

    year    = input[0:4]
    month   = input[4:6]
    day     = input[6:8]
    hour    = input[8:10]
    minute  = input[10:12]
    seconds = input[12:14]

    # the time part is hour:minute:seconds -- month only belongs to the date
    return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, minute, seconds)