Configuration now supports templates, and the command line accepts a reformat

path option, --reformat_path= or -R.
This commit is contained in:
dim 2007-11-26 21:33:24 +00:00
parent 3a8ac261c8
commit 1e7a16b869
9 changed files with 144 additions and 72 deletions

8
debian/changelog vendored
View File

@ -1,3 +1,11 @@
pgloader (2.2.5-dev) unstable; urgency=low
* Configuration now supports templates
* Command line option for setting --reformat_path, -R
*
-- Dimitri Fontaine <dim@tapoueh.org> Mon, 26 Nov 2007 21:53:11 +0100
pgloader (2.2.4) unstable; urgency=low
* Reformat modules to change input on-the-fly

View File

@ -101,6 +101,10 @@ def parse_options():
default = None,
help = "input files encoding")
parser.add_option("-R", "--reformat_path", dest = "reformat_path",
default = None,
help = "PATH where to find reformat python modules")
(opts, args) = parser.parse_args()
if opts.version:
@ -129,10 +133,13 @@ def parse_options():
print "Error: Can't be verbose and quiet at the same time!"
sys.exit(1)
# if debug, then verbose
if opts.debug:
opts.verbose = True
pgloader.options.DRY_RUN = opts.dryrun
pgloader.options.DEBUG = opts.debug
# if debug, then verbose
pgloader.options.VERBOSE = opts.verbose or opts.debug
pgloader.options.VERBOSE = opts.verbose
pgloader.options.QUIET = opts.quiet
pgloader.options.SUMMARY = opts.summary
pgloader.options.PEDANTIC = opts.pedantic
@ -146,6 +153,9 @@ def parse_options():
pgloader.options.INPUT_ENCODING = opts.encoding
if opts.reformat_path:
pgloader.options.REFORMAT_PATH = opts.reformat_path
return opts.config, args
def parse_config(conffile):
@ -227,19 +237,10 @@ def parse_config(conffile):
config.get(section, 'empty_string'))
if config.has_option(section, 'reformat_path'):
import os.path
reformat_path = []
tmp_rpath = config.get(section, 'reformat_path')
for p in tmp_rpath.split(':'):
if os.path.exists(p):
reformat_path.append(p)
else:
print 'Error: reformat_path %s does not exists, ignored'%p
pgloader.options.REFORMAT_PATH = reformat_path
else:
pgloader.reformat_path = None
# command line value is preferred over the configuration file one
if not pgloader.options.REFORMAT_PATH:
rpath = config.get(section, 'reformat_path')
pgloader.options.REFORMAT_PATH = rpath
except Exception, error:
print "Error: Could not initialize PostgreSQL connection:"
@ -361,6 +362,25 @@ def load_data():
# now init db connection
config, dbconn = parse_config(conffile)
from pgloader.tools import read_path, check_path
from pgloader.options import VERBOSE
import pgloader.options
rpath = read_path(pgloader.options.REFORMAT_PATH, check = False)
crpath = check_path(rpath, VERBOSE)
if not crpath:
# don't check same path entries twice
default_rpath = set(crpath) \
- set(pgloader.options.DEFAULT_REFORMAT_PATH)
pgloader.options.REFORMAT_PATH = check_path(default_rpath, VERBOSE)
else:
pgloader.options.REFORMAT_PATH = rpath
if VERBOSE:
print 'Notice: Reformat path is', pgloader.options.REFORMAT_PATH
print
# load some pgloader package modules
from pgloader.options import VERBOSE, DEBUG, QUIET, SUMMARY
from pgloader.options import DRY_RUN, PEDANTIC, VACUUM
@ -392,11 +412,17 @@ def load_data():
sections.sort()
for s in sections:
try:
if VERBOSE:
print
pgloader = PGLoader(s, config, dbconn)
if not pgloader.template:
pgloader.run()
summary[s] = (pgloader.table,) + pgloader.summary()
else:
if VERBOSE:
print "Skipping section %s, which is a template" % s
except PGLoader_Error, e:
if e == '':

View File

@ -0,0 +1,3 @@
"""
pgloader package, offering modules to implement pgloader.
"""

View File

@ -2,7 +2,7 @@
#
# Some common options, for each module to get them
PGLOADER_VERSION = '2.2.4'
PGLOADER_VERSION = '2.2.5-devel'
INPUT_ENCODING = None
PG_CLIENT_ENCODING = 'latin9'
@ -32,4 +32,5 @@ FROM_ID = None
UDC_PREFIX = 'udc_'
REFORMAT_PATH = ['/usr/share/pgloader/reformat']
REFORMAT_PATH = None
DEFAULT_REFORMAT_PATH = ['/usr/share/pgloader/reformat']

View File

@ -54,11 +54,9 @@ class PGLoader:
# just skip it here
if VERBOSE:
print
print "[%s] skip template configuration" % self.name
print "[%s] is a template" % self.name
if not self.template and VERBOSE:
print
print "[%s] parse configuration" % self.name
if not self.template:
@ -83,11 +81,17 @@ class PGLoader:
# now load specific configuration
if VERBOSE:
print
print "Reading configuration from section [%s]" % name
self.__read_conf__(name, config, db)
# force reinit of self.reader, which depends on template and
# specific options
if 'reader' in self.__dict__:
self.reader.__init__(self.db, self.reject,
self.filename, self.input_encoding,
self.table, self.columns)
if DEBUG:
print '%s init done' % name
print
@ -181,11 +185,15 @@ class PGLoader:
print 'columns', self.columns
print 'blob_columns', self.blob_cols
if self.name == name and not self.columns:
print 'Error: %s has no columns defined' % name
self.config_errors += 1
if self.columns is None:
if not self.template:
print 'Error: %s has no columns defined' % name
self.config_errors += 1
self.columns = []
else:
# non critical error, and code thereafter wants to use
# self.columns as a list
self.columns = []
##
# The config section can also provide user-defined columns
@ -331,9 +339,7 @@ class PGLoader:
self.columnlist = [n for (n, pos) in self.columns]
if DEBUG:
#print "columns", self.columns
print "only_cols", self.only_cols
#print "udcs", self.udcs
print "columnlist", self.columnlist
##
@ -364,47 +370,28 @@ class PGLoader:
if config.has_option(name, 'format'):
self.format = config.get(name, 'format')
if 'reader' not in self.__dict__:
if DEBUG:
print 'READER INIT'
if self.format.lower() == 'csv':
from csvreader import CSVReader
self.reader = CSVReader(self.db, self.reject,
self.filename,
self.input_encoding,
self.table, self.columns)
if self.format.lower() == 'csv':
from csvreader import CSVReader
self.reader = CSVReader(self.db, self.reject,
self.filename, self.input_encoding,
self.table, self.columns)
elif self.format.lower() == 'text':
from textreader import TextReader
self.reader = TextReader(self.db, self.reject,
self.filename,
self.input_encoding,
self.table, self.columns,
self.newline_escapes)
elif self.format.lower() == 'text':
from textreader import TextReader
self.reader = TextReader(self.db, self.reject,
self.filename, self.input_encoding,
self.table, self.columns,
self.newline_escapes)
self.reader.readconfig(name, config)
if 'reader' in self.__dict__:
if DEBUG:
print 'reader.readconfig()'
self.reader.readconfig(name, config)
if not self.template and self.format is None:
# error only when not loading the Template part
print 'Error: %s: format parameter needed' % name
raise PGLoader_Error
else:
if DEBUG:
print 'MANUAL REINIT OF READER'
self.reader.reject = self.reject
self.reader.filename = self.filename
self.reader.input_encoding = self.input_encoding
self.reader.newline_escapes = self.newline_escapes
self.reader.readconfig(name, config)
print 'BLURPS', self.reader.trailing_sep
## ##
## # parse the reader specific section options
## if not self.template:
## self.reader.readconfig(name, config)
## print 'BLURPS', self.reader.trailing_sep
##
# Some column might need reformating
@ -444,8 +431,10 @@ class PGLoader:
print 'Error: %s failed to import reformat module "%s"' \
% (name, r_module)
print ' from %s' % str(REFORMAT_PATH)
print ' %s' % e
self.config_errors += 1
if module:
if r_function in module.__dict__:
self.reformat.append((r_colname,

View File

@ -22,6 +22,9 @@ class DataReader:
def __init__(self, db, reject, filename, input_encoding, table, columns):
""" init internal variables """
if DEBUG:
print 'reader __init__', filename, table, columns
self.db = db
self.filename = filename
self.input_encoding = input_encoding
@ -32,7 +35,7 @@ class DataReader:
if self.input_encoding is None:
if INPUT_ENCODING is not None:
self.input_encoding = INPUT_ENCODING
def readconfig(self, name, config):
""" read configuration section for common options
@ -67,8 +70,9 @@ class DataReader:
self.db.copy_sep = self.field_sep
if DEBUG and not DRY_RUN:
print "null: '%s'" % self.db.null
print "empty_string: '%s'" % self.db.empty_string
print "reader.readconfig null: '%s'" % self.db.null
print "reader.readconfig empty_string: '%s'" \
% self.db.empty_string
def readlines(self):
""" read data from configured file, and generate (yields) for

View File

@ -31,28 +31,39 @@ class TextReader(DataReader):
"""
def __init__(self, db, reject, filename, input_encoding,
table, columns, newline_escapes):
table, columns, newline_escapes = None):
""" init textreader with a newline_escapes parameter """
DataReader.__init__(self, db, reject,
filename, input_encoding, table, columns)
self.newline_escapes = newline_escapes
if 'newline_escapes' not in self.__dict__:
self.newline_escapes = newline_escapes
def readconfig(self, name, config):
""" get this reader module configuration from config file """
DataReader.readconfig(self, name, config)
# this will be called twice if templates are in use, so we
# have to protect ourselves against discarding already read
# configuration values during the second run.
# optional number of columns per line
self.field_count = None
if 'field_count' not in self.__dict__:
self.field_count = None
if config.has_option(name, 'field_count'):
self.field_count = config.getint(name, 'field_count')
# optional trailing separator option
self.trailing_sep = False
if 'trailing_sep' not in self.__dict__:
self.trailing_sep = False
if config.has_option(name, 'trailing_sep'):
self.trailing_sep = config.get(name, 'trailing_sep') == 'True'
if DEBUG:
print 'reader.readconfig: field_count', self.field_count
print 'reader.readconfig: trailing_sep', self.trailing_sep
def readlines(self):
""" read data from configured file, and generate (yields) for

View File

@ -116,3 +116,34 @@ def parse_config_string(str):
def read_path(strpath, verbose = False, path = None, check = True):
    """ read a path configuration element into a list of entries

    strpath is a ':' separated string of path entries. None or an
    empty string contributes no entry at all.

    path is an optional list; when given, the entries are appended to
    it in place and that same list is used for the result. When
    omitted, a fresh list is created on each call (a shared mutable
    default would keep accumulating entries from one call to the next).

    When check is true the entries are filtered by check_path(), which
    receives verbose, and the filtered list is returned. Otherwise the
    unfiltered list is returned.
    """
    if path is None:
        path = []

    # the option may be left unset (None), in which case there is
    # nothing to split
    if strpath:
        path.extend(strpath.split(':'))

    if check:
        return check_path(path, verbose)
    return path
def check_path(path, verbose = False):
    """ return the entries of path which are existing directories

    path is an iterable of filesystem path strings. The result is a
    new list holding, in the same order, the entries that exist and
    are directories. Symbolic links pointing to a directory are kept,
    because os.path.isdir() follows symbolic links.

    When verbose is true, a warning is printed for each entry that is
    dropped, telling whether it does not exist or is not a directory.
    """
    # imported here so that the function does not depend on what the
    # rest of the module happens to import
    import os.path

    path_ok = []

    for p in path:
        if not os.path.exists(p):
            if verbose:
                print("Warning: path entry '%s' does not exists, ignored" % p)
            continue

        if os.path.isdir(p):
            path_ok.append(p)
        elif verbose:
            print("Warning: path entry '%s' " % p +
                  "is not a directory or does not link to a directory")

    return path_ok

View File

@ -2,7 +2,6 @@
#
# pgloader mysql reformating module
#
from pgloader.tools import PGLoader_Error
def timestamp(reject, input):
""" Reformat str as a PostgreSQL timestamp
@ -12,7 +11,7 @@ def timestamp(reject, input):
"""
if len(input) != 14:
e = "MySQL timestamp reformat input too short: %s" % input
raise PGLoader_Error, e
reject.log(e, input)
year = input[0:4]
month = input[4:6]
@ -21,4 +20,4 @@ def timestamp(reject, input):
minute = input[10:12]
seconds = input[12:14]
return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, month, seconds)
return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, minute, seconds)