mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-05 02:46:10 +02:00
Configuration now knows about templates, and command line about a reformat
path option, --reformat_path= or -R.
This commit is contained in:
parent
3a8ac261c8
commit
1e7a16b869
8
debian/changelog
vendored
8
debian/changelog
vendored
@ -1,3 +1,11 @@
|
||||
pgloader (2.2.5-dev) unstable; urgency=low
|
||||
|
||||
* Configuration now supports templates
|
||||
* Command line option for setting --reformat_path, -R
|
||||
*
|
||||
|
||||
-- Dimitri Fontaine <dim@tapoueh.org> Mon, 26 Nov 2007 21:53:11 +0100
|
||||
|
||||
pgloader (2.2.4) unstable; urgency=low
|
||||
|
||||
* Reformat modules to change input on-the-fly
|
||||
|
||||
56
pgloader.py
56
pgloader.py
@ -101,6 +101,10 @@ def parse_options():
|
||||
default = None,
|
||||
help = "input files encoding")
|
||||
|
||||
parser.add_option("-R", "--reformat_path", dest = "reformat_path",
|
||||
default = None,
|
||||
help = "PATH where to find reformat python modules")
|
||||
|
||||
(opts, args) = parser.parse_args()
|
||||
|
||||
if opts.version:
|
||||
@ -129,10 +133,13 @@ def parse_options():
|
||||
print "Error: Can't be verbose and quiet at the same time!"
|
||||
sys.exit(1)
|
||||
|
||||
# if debug, then verbose
|
||||
if opts.debug:
|
||||
opts.verbose = True
|
||||
|
||||
pgloader.options.DRY_RUN = opts.dryrun
|
||||
pgloader.options.DEBUG = opts.debug
|
||||
# if debug, then verbose
|
||||
pgloader.options.VERBOSE = opts.verbose or opts.debug
|
||||
pgloader.options.VERBOSE = opts.verbose
|
||||
pgloader.options.QUIET = opts.quiet
|
||||
pgloader.options.SUMMARY = opts.summary
|
||||
pgloader.options.PEDANTIC = opts.pedantic
|
||||
@ -146,6 +153,9 @@ def parse_options():
|
||||
|
||||
pgloader.options.INPUT_ENCODING = opts.encoding
|
||||
|
||||
if opts.reformat_path:
|
||||
pgloader.options.REFORMAT_PATH = opts.reformat_path
|
||||
|
||||
return opts.config, args
|
||||
|
||||
def parse_config(conffile):
|
||||
@ -227,19 +237,10 @@ def parse_config(conffile):
|
||||
config.get(section, 'empty_string'))
|
||||
|
||||
if config.has_option(section, 'reformat_path'):
|
||||
import os.path
|
||||
reformat_path = []
|
||||
tmp_rpath = config.get(section, 'reformat_path')
|
||||
|
||||
for p in tmp_rpath.split(':'):
|
||||
if os.path.exists(p):
|
||||
reformat_path.append(p)
|
||||
else:
|
||||
print 'Error: reformat_path %s does not exists, ignored'%p
|
||||
|
||||
pgloader.options.REFORMAT_PATH = reformat_path
|
||||
else:
|
||||
pgloader.reformat_path = None
|
||||
# command line value is preferred over the config file one
|
||||
if not pgloader.options.REFORMAT_PATH:
|
||||
rpath = config.get(section, 'reformat_path')
|
||||
pgloader.options.REFORMAT_PATH = rpath
|
||||
|
||||
except Exception, error:
|
||||
print "Error: Could not initialize PostgreSQL connection:"
|
||||
@ -361,6 +362,25 @@ def load_data():
|
||||
# now init db connection
|
||||
config, dbconn = parse_config(conffile)
|
||||
|
||||
from pgloader.tools import read_path, check_path
|
||||
from pgloader.options import VERBOSE
|
||||
import pgloader.options
|
||||
rpath = read_path(pgloader.options.REFORMAT_PATH, check = False)
|
||||
crpath = check_path(rpath, VERBOSE)
|
||||
|
||||
if not crpath:
|
||||
# don't check same path entries twice
|
||||
default_rpath = set(crpath) \
|
||||
- set(pgloader.options.DEFAULT_REFORMAT_PATH)
|
||||
|
||||
pgloader.options.REFORMAT_PATH = check_path(default_rpath, VERBOSE)
|
||||
else:
|
||||
pgloader.options.REFORMAT_PATH = rpath
|
||||
|
||||
if VERBOSE:
|
||||
print 'Notice: Reformat path is', pgloader.options.REFORMAT_PATH
|
||||
print
|
||||
|
||||
# load some pgloader package modules
|
||||
from pgloader.options import VERBOSE, DEBUG, QUIET, SUMMARY
|
||||
from pgloader.options import DRY_RUN, PEDANTIC, VACUUM
|
||||
@ -392,11 +412,17 @@ def load_data():
|
||||
sections.sort()
|
||||
for s in sections:
|
||||
try:
|
||||
if VERBOSE:
|
||||
print
|
||||
|
||||
pgloader = PGLoader(s, config, dbconn)
|
||||
|
||||
if not pgloader.template:
|
||||
pgloader.run()
|
||||
summary[s] = (pgloader.table,) + pgloader.summary()
|
||||
else:
|
||||
if VERBOSE:
|
||||
print "Skipping section %s, which is a template" % s
|
||||
|
||||
except PGLoader_Error, e:
|
||||
if e == '':
|
||||
|
||||
@ -0,0 +1,3 @@
|
||||
"""
|
||||
pgloader package, offering modules to implement pgloader.
|
||||
"""
|
||||
@ -2,7 +2,7 @@
|
||||
#
|
||||
# Some common options, for each module to get them
|
||||
|
||||
PGLOADER_VERSION = '2.2.4'
|
||||
PGLOADER_VERSION = '2.2.5-devel'
|
||||
|
||||
INPUT_ENCODING = None
|
||||
PG_CLIENT_ENCODING = 'latin9'
|
||||
@ -32,4 +32,5 @@ FROM_ID = None
|
||||
|
||||
UDC_PREFIX = 'udc_'
|
||||
|
||||
REFORMAT_PATH = ['/usr/share/pgloader/reformat']
|
||||
REFORMAT_PATH = None
|
||||
DEFAULT_REFORMAT_PATH = ['/usr/share/pgloader/reformat']
|
||||
|
||||
@ -54,11 +54,9 @@ class PGLoader:
|
||||
|
||||
# just skip it here
|
||||
if VERBOSE:
|
||||
print
|
||||
print "[%s] skip template configuration" % self.name
|
||||
print "[%s] is a template" % self.name
|
||||
|
||||
if not self.template and VERBOSE:
|
||||
print
|
||||
print "[%s] parse configuration" % self.name
|
||||
|
||||
if not self.template:
|
||||
@ -83,11 +81,17 @@ class PGLoader:
|
||||
|
||||
# now load specific configuration
|
||||
if VERBOSE:
|
||||
print
|
||||
print "Reading configuration from section [%s]" % name
|
||||
|
||||
self.__read_conf__(name, config, db)
|
||||
|
||||
# force reinit of self.reader, which depends on template and
|
||||
# specific options
|
||||
if 'reader' in self.__dict__:
|
||||
self.reader.__init__(self.db, self.reject,
|
||||
self.filename, self.input_encoding,
|
||||
self.table, self.columns)
|
||||
|
||||
if DEBUG:
|
||||
print '%s init done' % name
|
||||
print
|
||||
@ -181,11 +185,15 @@ class PGLoader:
|
||||
print 'columns', self.columns
|
||||
print 'blob_columns', self.blob_cols
|
||||
|
||||
if self.name == name and not self.columns:
|
||||
print 'Error: %s has no columns defined' % name
|
||||
self.config_errors += 1
|
||||
if self.columns is None:
|
||||
if not self.template:
|
||||
print 'Error: %s has no columns defined' % name
|
||||
self.config_errors += 1
|
||||
|
||||
self.columns = []
|
||||
else:
|
||||
# non critical error, and code thereafter wants to use
|
||||
# self.columns as a list
|
||||
self.columns = []
|
||||
|
||||
##
|
||||
# The config section can also provide user-defined colums
|
||||
@ -331,9 +339,7 @@ class PGLoader:
|
||||
self.columnlist = [n for (n, pos) in self.columns]
|
||||
|
||||
if DEBUG:
|
||||
#print "columns", self.columns
|
||||
print "only_cols", self.only_cols
|
||||
#print "udcs", self.udcs
|
||||
print "columnlist", self.columnlist
|
||||
|
||||
##
|
||||
@ -364,47 +370,28 @@ class PGLoader:
|
||||
if config.has_option(name, 'format'):
|
||||
self.format = config.get(name, 'format')
|
||||
|
||||
if 'reader' not in self.__dict__:
|
||||
if DEBUG:
|
||||
print 'READER INIT'
|
||||
|
||||
if self.format.lower() == 'csv':
|
||||
from csvreader import CSVReader
|
||||
self.reader = CSVReader(self.db, self.reject,
|
||||
self.filename,
|
||||
self.input_encoding,
|
||||
self.table, self.columns)
|
||||
if self.format.lower() == 'csv':
|
||||
from csvreader import CSVReader
|
||||
self.reader = CSVReader(self.db, self.reject,
|
||||
self.filename, self.input_encoding,
|
||||
self.table, self.columns)
|
||||
|
||||
elif self.format.lower() == 'text':
|
||||
from textreader import TextReader
|
||||
self.reader = TextReader(self.db, self.reject,
|
||||
self.filename,
|
||||
self.input_encoding,
|
||||
self.table, self.columns,
|
||||
self.newline_escapes)
|
||||
elif self.format.lower() == 'text':
|
||||
from textreader import TextReader
|
||||
self.reader = TextReader(self.db, self.reject,
|
||||
self.filename, self.input_encoding,
|
||||
self.table, self.columns,
|
||||
self.newline_escapes)
|
||||
|
||||
self.reader.readconfig(name, config)
|
||||
if 'reader' in self.__dict__:
|
||||
if DEBUG:
|
||||
print 'reader.readconfig()'
|
||||
self.reader.readconfig(name, config)
|
||||
|
||||
if not self.template and self.format is None:
|
||||
# error only when not loading the Template part
|
||||
print 'Error: %s: format parameter needed' % name
|
||||
raise PGLoader_Error
|
||||
else:
|
||||
if DEBUG:
|
||||
print 'MANUAL REINIT OF READER'
|
||||
self.reader.reject = self.reject
|
||||
self.reader.filename = self.filename
|
||||
self.reader.input_encoding = self.input_encoding
|
||||
self.reader.newline_escapes = self.newline_escapes
|
||||
self.reader.readconfig(name, config)
|
||||
|
||||
print 'BLURPS', self.reader.trailing_sep
|
||||
|
||||
## ##
|
||||
## # parse the reader specific section options
|
||||
## if not self.template:
|
||||
## self.reader.readconfig(name, config)
|
||||
## print 'BLURPS', self.reader.trailing_sep
|
||||
|
||||
##
|
||||
# Some column might need reformating
|
||||
@ -444,8 +431,10 @@ class PGLoader:
|
||||
print 'Error: %s failed to import reformat module "%s"' \
|
||||
% (name, r_module)
|
||||
print ' from %s' % str(REFORMAT_PATH)
|
||||
print ' %s' % e
|
||||
self.config_errors += 1
|
||||
|
||||
|
||||
if module:
|
||||
if r_function in module.__dict__:
|
||||
self.reformat.append((r_colname,
|
||||
|
||||
@ -22,6 +22,9 @@ class DataReader:
|
||||
|
||||
def __init__(self, db, reject, filename, input_encoding, table, columns):
|
||||
""" init internal variables """
|
||||
if DEBUG:
|
||||
print 'reader __init__', filename, table, columns
|
||||
|
||||
self.db = db
|
||||
self.filename = filename
|
||||
self.input_encoding = input_encoding
|
||||
@ -32,7 +35,7 @@ class DataReader:
|
||||
if self.input_encoding is None:
|
||||
if INPUT_ENCODING is not None:
|
||||
self.input_encoding = INPUT_ENCODING
|
||||
|
||||
|
||||
def readconfig(self, name, config):
|
||||
""" read configuration section for common options
|
||||
|
||||
@ -67,8 +70,9 @@ class DataReader:
|
||||
self.db.copy_sep = self.field_sep
|
||||
|
||||
if DEBUG and not DRY_RUN:
|
||||
print "null: '%s'" % self.db.null
|
||||
print "empty_string: '%s'" % self.db.empty_string
|
||||
print "reader.readconfig null: '%s'" % self.db.null
|
||||
print "reader.readconfig empty_string: '%s'" \
|
||||
% self.db.empty_string
|
||||
|
||||
def readlines(self):
|
||||
""" read data from configured file, and generate (yields) for
|
||||
|
||||
@ -31,28 +31,39 @@ class TextReader(DataReader):
|
||||
"""
|
||||
|
||||
def __init__(self, db, reject, filename, input_encoding,
|
||||
table, columns, newline_escapes):
|
||||
table, columns, newline_escapes = None):
|
||||
""" init textreader with a newline_escapes parameter """
|
||||
DataReader.__init__(self, db, reject,
|
||||
filename, input_encoding, table, columns)
|
||||
|
||||
self.newline_escapes = newline_escapes
|
||||
|
||||
if 'newline_escapes' not in self.__dict__:
|
||||
self.newline_escapes = newline_escapes
|
||||
|
||||
def readconfig(self, name, config):
|
||||
""" get this reader module configuration from config file """
|
||||
DataReader.readconfig(self, name, config)
|
||||
|
||||
# this will be called twice if templates are in use, so we
|
||||
# have to protect ourselves against removing already read
|
||||
# configurations while in second run.
|
||||
|
||||
# optionnal number of columns per line
|
||||
self.field_count = None
|
||||
if 'field_count' not in self.__dict__:
|
||||
self.field_count = None
|
||||
|
||||
if config.has_option(name, 'field_count'):
|
||||
self.field_count = config.getint(name, 'field_count')
|
||||
|
||||
# optionnal trailing separator option
|
||||
self.trailing_sep = False
|
||||
if 'trailing_sep' not in self.__dict__:
|
||||
self.trailing_sep = False
|
||||
|
||||
if config.has_option(name, 'trailing_sep'):
|
||||
self.trailing_sep = config.get(name, 'trailing_sep') == 'True'
|
||||
|
||||
if DEBUG:
|
||||
print 'reader.readconfig: field_count', self.field_count
|
||||
print 'reader.readconfig: trailing_sep', self.trailing_sep
|
||||
|
||||
def readlines(self):
|
||||
""" read data from configured file, and generate (yields) for
|
||||
|
||||
@ -116,3 +116,34 @@ def parse_config_string(str):
|
||||
|
||||
|
||||
|
||||
def read_path(strpath, verbose = False, path = None, check = True):
    """ read a path configuration element into a list of entries

    strpath is a colon separated list of entries, each of them is appended
    to path. A missing strpath (None) adds nothing. path defaults to a new
    list for each call; a list given by the caller is extended in place.

    When check is true the result is filtered by check_path(), which is
    given verbose and discards unusable entries. Otherwise the unchecked
    list is returned.
    """
    # a mutable default ([]) is created once and shared by every call, so
    # entries read by one call would leak into the following ones
    if path is None:
        path = []

    # no path configured at all: nothing to split
    if strpath is not None:
        path.extend(strpath.split(':'))

    if check:
        return check_path(path, verbose)

    return path
|
||||
|
||||
def check_path(path, verbose = False):
    """ removes non existent and non directory entries from path

    path is an iterable of filesystem entries. Returns a new list holding,
    in the same order, the entries which are directories or symlinks
    resolving to a directory. The given path is not modified.

    When verbose is true, a warning is printed for each discarded entry.
    """
    # imported here so that this function does not depend on the enclosing
    # module having imported os itself
    import os.path

    path_ok = []

    for p in path:
        # isdir() follows symlinks, so it covers both plain directories
        # and links pointing to a directory
        if os.path.isdir(p):
            path_ok.append(p)

        elif not verbose:
            continue

        elif os.path.exists(p):
            print("Warning: path entry '%s' " % p +
                  "is not a directory or does not link to a directory")

        else:
            # missing entries and dangling symlinks end up here
            print("Warning: path entry '%s' does not exists, ignored" % p)

    return path_ok
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
#
|
||||
# pgloader mysql reformating module
|
||||
#
|
||||
from pgloader.tools import PGLoader_Error
|
||||
|
||||
def timestamp(reject, input):
|
||||
""" Reformat str as a PostgreSQL timestamp
|
||||
@ -12,7 +11,7 @@ def timestamp(reject, input):
|
||||
"""
|
||||
if len(input) != 14:
|
||||
e = "MySQL timestamp reformat input too short: %s" % input
|
||||
raise PGLoader_Error, e
|
||||
reject.log(e, input)
|
||||
|
||||
year = input[0:4]
|
||||
month = input[4:6]
|
||||
@ -21,4 +20,4 @@ def timestamp(reject, input):
|
||||
minute = input[10:12]
|
||||
seconds = input[12:14]
|
||||
|
||||
return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, month, seconds)
|
||||
return '%s-%s-%s %s:%s:%s' % (year, month, day, hour, minute, seconds)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user