diff --git a/debian/changelog b/debian/changelog index 7d36197..8a4dd30 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,10 +1,20 @@ +pgloader (2.2.3) unstable; urgency=low + + * User Defined Columns + * Temporary files with copy data content now suffixed .pgloader + * New option --version + * Fix TextReader newline_escapes configuration option reading + * Fix Reader reject initialisation + * Skip database related settings when in DRY_RUN mode (-n) + * List all command line options from man page synopsis + + -- Dimitri Fontaine Wed, 14 Nov 2007 21:57:39 +0100 + pgloader (2.2.2) unstable; urgency=low * New command line options --quiet and --summary (-qs for short) - * Bugfix: temp filename no more contains columns (per user report) - * debian package now properly include and install the man page - -- Dimitri Fontaine Sun, 11 Nov 2007 20:44:23 +0100 + -- Dimitri Fontaine Sat, 20 Oct 2007 16:20:18 +0200 pgloader (2.2.1) unstable; urgency=low diff --git a/examples/udc/udc.data b/examples/udc/udc.data new file mode 100644 index 0000000..37d377e --- /dev/null +++ b/examples/udc/udc.data @@ -0,0 +1,5 @@ +1§5§foo§bar +2§10§bar§toto +3§4§toto§titi +4§18§titi§baz +5§2§baz§foo diff --git a/examples/udc/udc.sql b/examples/udc/udc.sql new file mode 100644 index 0000000..3558801 --- /dev/null +++ b/examples/udc/udc.sql @@ -0,0 +1,5 @@ +CREATE TABLE udc ( + b integer primary key, + c text, + d integer +); \ No newline at end of file diff --git a/pgloader.1.txt b/pgloader.1.txt index f071fac..af68647 100644 --- a/pgloader.1.txt +++ b/pgloader.1.txt @@ -6,9 +6,11 @@ pgloader - Import CSV data and Large Object to PostgreSQL == SYNOPSIS == - pgloader [-c configuration file] [-p pedantic] [-d debug] [-v verbose] - [-n dryrun] [-Cn count] [-Fn from] [-In from id] [-E input files encoding] - [Section1 Section2] +pgloader [--version] [-c configuration file] + [-p pedantic] [-d debug] [-v verbose] [-q quiet] [-s summary] + [-n dryrun] [-Cn count] [-Fn from] [-In from id] + [-E 
input files encoding] + [Section1 Section2] == DESCRIPTION == @@ -31,9 +33,9 @@ content properly escaped to the +COPY+ data. +pgloader+ issue some timing statistics every +commit_every+ commits (see Configuration for this setting). At the end of each section -processing, a summary of overall operations, numbers of updates and -commits, time it took in seconds, errors logged and database errors is -issued. +processing, a summary of overall operations, numbers of rows copied +and commits, time it took in seconds, errors logged and database +errors is issued. +pgloader+ is available from +pgfoundry+ at http://pgfoundry.org/projects/pgloader/[], where you'll find a debian @@ -45,6 +47,10 @@ In order for pgloader to run, you have to edit a configuration file (see Configuration) consisting of Section definitions. Each section refers to a PostgreSQL table into which some data is to be loaded. +--version:: + + print out pgloader version, then quit. + -c, --config:: specifies the configuration file to use. The default file name is @@ -184,6 +190,7 @@ single big +COPY+ attempt, but copy copy_every lines at a time. + This parameter is optionnal and defaults to 10000. +////////////////////////////////////////// commit_every:: + PostgreSQL +COMMIT+ frequency, exprimed in +UPDATE+ orders. A good @@ -194,6 +201,7 @@ input lines. closing and when a SQL error occurs. + This parameter is optionnal and defaults to 1000. +////////////////////////////////////////// copy_delimiter:: + @@ -312,6 +320,7 @@ This parameter is optionnal and defaults to '\ ' (that is backslash followed by space). If defined on a table level, this local value will overwritte the global one. +////////////////////////////////////////// index:: + Table index definition, to be used in blob +UPDATE+'ing. You define an @@ -321,11 +330,15 @@ table has a composite key, then you can define multiple columns here, separated by a comma. 
+ index = colname:3, other_colname:5 +////////////////////////////////////////// columns:: + -You can define here table columns, with the same definition format as -in previous index parameter. +You can define here table columns, by giving their names and +optionally column number (as found in your data file, and counting +from 1) separated by a colon. ++ + columns = x, y, a, b, d:6, c:5 + Note you'll have to define here all the columns to be found in data file, whether you want to use them all or not. When not using them @@ -335,18 +348,50 @@ As of +pgloader 2.2+ the column list used might not be the same as the table columns definition. + As of +pgloader 2.2.1+ you can omit column numbering if you want to, a -counter is then maintained for you, starting from 1 and set to +last -value + 1+ on each column, where +last value+ was either computed or +counter is then maintained for you, starting from 1 and set to +$$last +value + 1$$+ on each column, where +last value+ was either computed or given in the config. So you can even omit only 'some' columns in there. -+ - columns = x, y, a, b, d:6, c:5 + In case you have a lot a columns per table, you will want to use multiple lines for this parameter value. Python ConfigParser module knows how to read multi-line parameters, you don't have to escape anything. +user_defined_columns:: ++ +Those are special columns not found in the data file but which you +want to load into the database. The configuration options beginning +with +udc_+ are taken as column names with constant values. The +following example defines the column +c+ as having the value +constant +value+ for each and every row of the input data file. ++ + udc_c = constant value ++ +The option +copy_columns+ is used to define the exact +columnsList+ +given to +COPY+. ++ +A simple use case is the loading into the same database table of data +coming from more than one file. 
If you need to keep track of the data +origin, add a column to the table model and define a 'udc_' for ++pgloader+ to add a constant value in the database. ++ +Using user-defined columns requires defining +copy_columns+ and is not +compatible with +only_cols+ usage. ++ + +copy_columns:: ++ +This option defines the columns to load from the input data file and +the user defined columns, and in which order to do this. Place here +the column names separated by commas. ++ + copy_columns = b, c, d ++ +This option is required if any user column is defined, and conflicts +with the +only_cols+ option. It won't have any effect when used in a +section where no user column is defined. + only_cols:: + If you want to only load a part of the columns you have into the data @@ -358,6 +403,9 @@ following example. + This parameter is optionnal and defaults to the list of all columns given on the columns parameter list, in the colname order. ++ +This option conflicts with user defined columns and +copy_columns+ +option. 
blob_columns:: + diff --git a/pgloader.py b/pgloader.py index af49192..8ce0755 100644 --- a/pgloader.py +++ b/pgloader.py @@ -36,6 +36,11 @@ def parse_options(): usage = "%prog [-c ] Section [Section ...]" parser = OptionParser(usage = usage) + parser.add_option("--version", action = "store_true", + dest = "version", + default = False, + help = "show pgloader version") + parser.add_option("-c", "--config", dest = "config", default = "pgloader.conf", help = "configuration file, defauts to pgloader.conf") @@ -98,6 +103,10 @@ def parse_options(): (opts, args) = parser.parse_args() + if opts.version: + print "PgLoader version %s" % pgloader.options.PGLOADER_VERSION + sys.exit(0) + # check existence en read ability of config file if not os.path.exists(opts.config): print "Error: Configuration file %s does not exists" % opts.config diff --git a/pgloader/db.py b/pgloader/db.py index 291be8f..4163328 100644 --- a/pgloader/db.py +++ b/pgloader/db.py @@ -267,7 +267,7 @@ class db: """ save copy buffer to a temporary file for further inspection """ import tempfile (f, n) = tempfile.mkstemp(prefix='%s.' % tablename, - suffix='.pgimport', dir='/tmp') + suffix='.pgloader', dir='/tmp') os.write(f, self.buffer.getvalue()) os.close(f) @@ -276,7 +276,7 @@ class db: print " -- COPY data buffer saved in %s --" % n return n - def copy_from(self, table, table_colspec, columns, input_line, + def copy_from(self, table, columnlist, columns, input_line, reject, EOF = False): """ Generate some COPY SQL for PostgreSQL """ ok = True @@ -286,7 +286,7 @@ class db: # build the table colomns specs from parameters # ie. 
we always issue COPY table (col1, col2, ..., coln) commands tablename = table - table = "%s (%s) " % (table, ", ".join(table_colspec)) + table = "%s (%s) " % (table, ", ".join(columnlist)) if DEBUG: print 'COPY %s' % table diff --git a/pgloader/options.py b/pgloader/options.py index 35ba728..f0e7725 100644 --- a/pgloader/options.py +++ b/pgloader/options.py @@ -2,6 +2,8 @@ # # Some common options, for each module to get them +PGLOADER_VERSION = '2.2.3' + INPUT_ENCODING = None PG_CLIENT_ENCODING = 'latin9' DATESTYLE = None @@ -28,5 +30,4 @@ COUNT = None FROM_COUNT = None FROM_ID = None - - +UDC_PREFIX = 'udc_' diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py index 00cd46a..c7341b3 100644 --- a/pgloader/pgloader.py +++ b/pgloader/pgloader.py @@ -18,6 +18,7 @@ from options import COUNT, FROM_COUNT, FROM_ID from options import INPUT_ENCODING, PG_CLIENT_ENCODING from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING from options import NEWLINE_ESCAPES +from options import UDC_PREFIX class PGLoader: """ @@ -72,7 +73,7 @@ class PGLoader: self.db.client_encoding = parse_config_string( config.get(name, 'client_encoding')) - if DEBUG: + if DEBUG and not DRY_RUN: print "client_encoding: '%s'" % self.db.client_encoding @@ -80,7 +81,7 @@ class PGLoader: if config.has_option(name, 'datestyle'): self.db.datestyle = config.get(name, 'datestyle') - if DEBUG: + if DEBUG and not DRY_RUN: print "datestyle: '%s'" % self.db.datestyle @@ -112,22 +113,110 @@ class PGLoader: print 'blob_columns', self.blob_cols + ## + # The config section can also provide user-defined colums + # which are option beginning with options.UDC_PREFIX + udcs = [o + for o in config.options(name) + if o[:len(UDC_PREFIX)] == UDC_PREFIX] + + if len(udcs) > 0: + self.udcs = [] + for udc in udcs: + udc_name = udc[:] + udc_name = udc_name[udc_name.find('_')+1:] + udc_value = config.get(name, udc) + + self.udcs.append((udc_name, udc_value)) + else: + self.udcs = None + + if DEBUG: + print 'udcs:', 
self.udcs + + # better check there's no user defined column overriding file + # columns + if self.udcs: + errs = [] + cols = [c for (c, cn) in self.columns] + for (udc_name, udc_value) in self.udcs: + if udc_name in cols: + errs.append(udc_name) + + if errs: + for c in errs: + print 'Error: %s is configured both as a ' % c +\ + '%s.columns entry and as a user-defined column' \ + % name + + self.config_errors += 1 + + # we need the copy_columns parameter if user-defined columns + # are used + if self.udcs: + if config.has_option(name, 'copy_columns'): + namelist = [n for (n, c) in self.columns] + \ + [n for (n, v) in self.udcs] + + copy_columns = config.get(name, 'copy_columns').split(',') + self.copy_columns = [x.strip() + for x in copy_columns + if x.strip() in namelist] + + if len(self.copy_columns) != len(copy_columns): + print 'Error: %s.copy_columns refers to ' % name +\ + 'unconfigured columns ' + + self.config_errors += 1 + + else: + print 'Error: section %s does not define ' % name +\ + 'copy_columns but uses user-defined columns' + + self.config_errors += 1 + + # in the copy_columns case, columnlist is that simple: + self.columnlist = None + if self.udcs: + if self.copy_columns: + self.columnlist = self.copy_columns + + if DEBUG: + print 'udcs', self.udcs + if self.udcs: + print 'copy_columns', self.copy_columns + ## # We have for example columns = col1:2, col2:1 # this means the order of input columns is not the same as the # awaited order of COPY, so we want a mapping index, here [2, 1] - if self.columns is not None: - self.col_mapping = [i for (c, i) in self.columns] + # + # The column mapping is to be done on all_columns, which + # allows user to have their user-defined columns talken into + # account in the COPY ordering. 
+ + self.col_mapping = [i for (c, i) in self.columns] + + if self.col_mapping == range(1, len(self.columns)+1): + # no mapping to do + self.col_mapping = None ## # optionnal partial loading option (sequences case) - # self.table_colspec is the column list to give to + # + # self.columnlist is the column list to give to # COPY table(...) command, either the cols given in # the only_cols config, or the columns directly + self.only_cols = None - self.table_colspec = [n for (n, pos) in self.columns] if config.has_option(name, 'only_cols'): + if self.udcs: + print 'Error: section %s defines both ' % name +\ + 'user-defined columns and only_cols' + + self.config_errors += 1 + self.only_cols = config.get(name, 'only_cols') ## @@ -147,17 +236,50 @@ class PGLoader: else: expanded.append(int(oc)) - self.only_cols = expanded - self.table_colspec = [self.columns[x-1][0] for x in expanded] + # we have to find colspec based on self.columns + self.only_cols = expanded + self.columnlist = [self.columns[x-1][0] for x in expanded] except Exception, e: - print 'Error: section %s, only_cols: configured range is invalid' % name + print 'Error: section %s, only_cols: ' % name +\ + 'configured range is invalid' raise PGLoader_Error, e - if DEBUG: - print "only_cols", self.only_cols - print "table_colspec", self.table_colspec + if self.only_cols is None: + if self.columnlist is None: + # default case, no user-defined cols, no restriction + self.columnlist = [n for (n, pos) in self.columns] + if DEBUG: + #print "columns", self.columns + print "only_cols", self.only_cols + #print "udcs", self.udcs + print "columnlist", self.columnlist + + ## + # This option is textreader specific, but being lazy and + # short-timed, I don't make self._parse_fields() callable from + # outside this class. Hence the code here. 
+ # + # optionnal newline escaped option + self.newline_escapes = [] + if config.has_option(name, 'newline_escapes'): + if NEWLINE_ESCAPES is not None: + # this parameter is globally set, will ignore local + # definition + if not QUIET: + print "Warning: ignoring %s newline_escapes option" % name + print " option is set to '%s' globally" \ + % NEWLINE_ESCAPES + else: + self._parse_fields('newline_escapes', + config.get(name, 'newline_escapes'), + argtype = 'char') + + if NEWLINE_ESCAPES is not None: + # set NEWLINE_ESCAPES for each table column + self.newline_escapes = [(a, NEWLINE_ESCAPES) + for (a, x) in self.columns] ## # data format, from which depend data reader @@ -167,11 +289,16 @@ class PGLoader: if self.format.lower() == 'csv': from csvreader import CSVReader - self.reader = CSVReader(self.db, self.filename, self.table, self.columns) + self.reader = CSVReader(self.db, self.reject, + self.filename, + self.table, self.columns) elif self.format.lower() == 'text': from textreader import TextReader - self.reader = TextReader(self.db, self.filename, self.table, self.columns) + self.reader = TextReader(self.db, self.reject, + self.filename, + self.table, self.columns, + self.newline_escapes) if self.format is None: print 'Error: %s: format parameter needed' % name @@ -342,32 +469,53 @@ class PGLoader: if self.blob_cols is not None: columns, rowids = self.read_blob(line, columns) - if DEBUG: - print self.col_mapping - print len(columns), len(self.col_mapping) + data = columns + + if self.udcs: + dudcs = dict(self.udcs) + ddict = dict(self.columns) + data = [] + for c in self.copy_columns: + if c in ddict: + data.append(columns[ddict[c]-1]) + else: + data.append(dudcs[c]) + + if DEBUG: + print 'columns', columns + print 'data ', data - ## - # Now we have to reorder the columns to match schema, and only - # consider data matched by self.only_cols - if self.only_cols is not None: - c_ordered = [columns[self.col_mapping[i-1]-1] for i in self.only_cols] else: - 
c_ordered = [columns[i-1] for i in self.col_mapping] + if self.col_mapping: + if DEBUG: + print 'col_mapping', self.col_mapping + + data = [columns[i-1] for i in self.col_mapping] + + if DEBUG: + print 'columns', columns + print 'data ', data + + if self.only_cols: + # only consider data matched by self.only_cols + if self.col_mapping: + data = [columns[self.col_mapping[i-1]-1] + for i in self.only_cols] + else: + data = [columns[i-1] for i in self.only_cols] if DRY_RUN or DEBUG: print line - print c_ordered - print len(c_ordered) - print self.table_colspec + print self.columnlist, data print if not DRY_RUN: - self.db.copy_from(self.table, self.table_colspec, - c_ordered, line, self.reject) + self.db.copy_from(self.table, self.columnlist, + data, line, self.reject) if not DRY_RUN: # we may need a last COPY for the rest of data - self.db.copy_from(self.table, self.table_colspec, + self.db.copy_from(self.table, self.columnlist, None, None, self.reject, EOF = True) return diff --git a/pgloader/reader.py b/pgloader/reader.py index cbab066..989e835 100644 --- a/pgloader/reader.py +++ b/pgloader/reader.py @@ -20,12 +20,13 @@ class DataReader: - multi-line support is explicit (via """ - def __init__(self, db, filename, table, columns): + def __init__(self, db, reject, filename, table, columns): """ init internal variables """ self.db = db self.filename = filename self.table = table self.columns = columns + self.reject = reject def readconfig(self, name, config): """ read configuration section for common options @@ -37,18 +38,19 @@ class DataReader: see textreader.py and csvreader.py """ - # optionnal null and empty_string per table parameters - if config.has_option(name, 'null'): - self.db.null = parse_config_string(config.get(name, 'null')) - else: - self.db.null = NULL - if config.has_option(name, 'empty_string'): - self.db.empty_string = parse_config_string( - config.get(name, 'empty_string')) - else: - self.db.empty_string = EMPTY_STRING + if not DRY_RUN: + # 
optionnal null and empty_string per table parameters + if config.has_option(name, 'null'): + self.db.null = parse_config_string(config.get(name, 'null')) + else: + self.db.null = NULL + if config.has_option(name, 'empty_string'): + self.db.empty_string = parse_config_string( + config.get(name, 'empty_string')) + else: + self.db.empty_string = EMPTY_STRING # optionnal field separator self.field_sep = FIELD_SEP @@ -59,7 +61,7 @@ class DataReader: if self.db.copy_sep is None: self.db.copy_sep = self.field_sep - if DEBUG: + if DEBUG and not DRY_RUN: print "null: '%s'" % self.db.null print "empty_string: '%s'" % self.db.empty_string diff --git a/pgloader/textreader.py b/pgloader/textreader.py index 84e3277..b557c9a 100644 --- a/pgloader/textreader.py +++ b/pgloader/textreader.py @@ -30,10 +30,17 @@ class TextReader(DataReader): - ... """ + def __init__(self, db, reject, filename, table, columns, newline_escapes): + """ init textreader with a newline_escapes parameter """ + DataReader.__init__(self, db, reject, filename, table, columns) + + self.newline_escapes = newline_escapes + + def readconfig(self, name, config): """ get this reader module configuration from config file """ DataReader.readconfig(self, name, config) - + # optionnal number of columns per line self.field_count = None if config.has_option(name, 'field_count'): @@ -44,27 +51,6 @@ class TextReader(DataReader): if config.has_option(name, 'trailing_sep'): self.trailing_sep = config.get(name, 'trailing_sep') == 'True' - # optionnal newline escaped option - self.newline_escapes = [] - if config.has_option(name, 'newline_escapes'): - if NEWLINE_ESCAPES is not None: - # this parameter is globally set, will ignore local - # definition - if not QUIET: - print "Warning: ignoring %s newline_escapes option" % name - print " option is set to '%s' globally" \ - % NEWLINE_ESCAPES - else: - self._parse_fields('newline_escapes', - config.get(name, 'newline_escapes'), - argtype = 'char') - - if NEWLINE_ESCAPES is not 
None: - # set NEWLINE_ESCAPES for each table column - self.newline_escapes = [(a, NEWLINE_ESCAPES) - for (a, x) in self.columns] - - def readlines(self): """ read data from configured file, and generate (yields) for