From 921db51d65338d38d024b5ef9bb992c1c356415e Mon Sep 17 00:00:00 2001 From: dim Date: Thu, 23 Aug 2007 10:38:09 +0000 Subject: [PATCH] * Support for datestyle setting * Support for omiting column numbering * Change documentation source format from SGML to asciidoc * New version 2.2.1 --- Makefile | 14 +- debian/changelog | 8 + debian/files | 2 +- examples/pgloader.conf | 3 +- examples/simple/simple.data | 6 +- pgloader.1.sgml | 853 ------------------------------------ pgloader.1.txt | 486 ++++++++++++++++++++ pgloader.py | 16 +- pgloader/csvreader.py | 1 - pgloader/db.py | 25 +- pgloader/lo.py | 1 - pgloader/options.py | 2 +- pgloader/pgloader.py | 21 +- pgloader/reader.py | 1 - pgloader/textreader.py | 1 - pgloader/tools.py | 1 - 16 files changed, 559 insertions(+), 882 deletions(-) delete mode 100644 pgloader.1.sgml create mode 100644 pgloader.1.txt diff --git a/Makefile b/Makefile index 2c26c20..fbc8918 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,4 @@ -DOCS = pgloader.1.sgml -GARBAGE = manpage.links manpage.refs +DOCS = pgloader.1.txt # debian setting DESTDIR = @@ -18,6 +17,11 @@ install: cp -a $(libs) $(libdir)/pgloader cp -a $(examples) $(exdir) -man: $(DOCS) - docbook2man $(DOCS) 2>/dev/null - -rm -f $(GARBAGE) +html: $(DOCS) + asciidoc -a toc $< + +pgloader.1.xml: $(DOCS) + asciidoc -d manpage -b docbook $< + +man: pgloader.1.xml + xmlto man $< diff --git a/debian/changelog b/debian/changelog index 1ad75c0..ba241f6 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,11 @@ +pgloader (2.2.1) unstable; urgency=low + + * Support for datestyle setting + * Support for omiting column numbering + * Change documentation source format from SGML to asciidoc + + -- Dimitri Fontaine Thu, 23 Aug 2007 12:35:34 +0200 + pgloader (2.2.0) unstable; urgency=low * Support for partial loading of data (subrange(s) of columns) diff --git a/debian/files b/debian/files index 841a98a..2dc1060 100644 --- a/debian/files +++ b/debian/files @@ -1 +1 @@ 
-pgloader_2.0.2_all.deb misc extra +pgloader_2.2.0_all.deb misc extra diff --git a/examples/pgloader.conf b/examples/pgloader.conf index b80ecdb..0e9fc5e 100644 --- a/examples/pgloader.conf +++ b/examples/pgloader.conf @@ -18,6 +18,7 @@ newline_escapes = \ [simple] table = simple format = text +datestyle = dmy filename = simple/simple.data field_sep = | trailing_sep = True @@ -75,6 +76,6 @@ format = csv filename = csv/csv.data field_sep = , quotechar = " -columns = x:1, y:2, a:3, b:4, c:5, d:6 +columns = x, y, a, b, d:6, c:5 only_cols = 3-6 diff --git a/examples/simple/simple.data b/examples/simple/simple.data index 6ef3d1f..5af4588 100644 --- a/examples/simple/simple.data +++ b/examples/simple/simple.data @@ -1,7 +1,7 @@ 1|some first row text|2006-11-11| -2|some second row text|2006-11-11| -3|some third row text|2006-10-12| +2|some second row text|13/11/2006| +3|some third row text|12-10-2006| 4|\ |2006-10-4| 5|some fifth row text|2006-5-12| -6|some sixth row text|2006-7-10| +6|some sixth row text|10/7/6| 7|some null date to play with|| \ No newline at end of file diff --git a/pgloader.1.sgml b/pgloader.1.sgml deleted file mode 100644 index cadf5d9..0000000 --- a/pgloader.1.sgml +++ /dev/null @@ -1,853 +0,0 @@ - - - -
- dim@tapoueh.org -
- - Dimitri - Fontaine - - August 2006 - - 2006 - Dimitri Fontaine - -
- - - pgloader - 1 - - - - pgloader - -Import CSV data and Large Object to PostgreSQL - - - - - - pgloader - configuration file - pedantic - debug - verbose - dry run - count - from - from id - input files encoding - Section1 Section2 - - - - - DESCRIPTION - - pgloader imports data from a flat file and - insert it into a database table. It uses a flat file per - database table, and you can configure as many Sections as you - want, each one associating a table name and a data file. - - - - Data are parsed and rewritten, then given to PostgreSQL - COPY command. Parsing is necessary for - dealing with end of lines and eventual trailing separator - characters, and for column reordering: your flat data file may - not have the same column order as the databse table has. - - - - pgloader is also able to load some large - objects data into PostgreSQL, as of now only Informix - UNLOAD data files are supported. This command - gives large objects data location information into the main data - file. pgloader parse it and produces and SQL - UPDATE order per large object, and commit those orders once - every commit_every configuration parameter. - - - - pgloader issue some timing statistics - every commit_every commits (see Configuration - for this setting). At the end of each section processing, a - summary of overall operations, numbers of updates and commits, - time it took in seconds, errors logged and database errors is - issued. - - - - - OPTIONS - - - In order for pgloader to run, you have to - edit a configuration file (see Configuration) consisting of - Section definitions. Each section refers to a PostgreSQL table - into which some data is to be loaded. - - - - - - - - - specifies the configuration file to use. The default file - name is pgloader.conf, searched into - current working directory. - - - - - - - - - - activates the pedantic mode, where any - warning is considered as a fatal error, thus stopping the - processing of the input file. 
- - - - - - - - - - makes pgloader say it all about what it - does. debug implies verbose. - - - - - - - - - - makes pgloader very verbose about - what it does. - - - - - - - - - - makes pgloader simulate operations, - that implies no database connection and no data extraction - from blob files. - - - - - - - - - - makes pgloader issue a truncate SQL - command before importing data. - - - - - - - - - - makes pgloader issue a vacuum full - verbose analyse SQL command before importing data. - - - This vacuum is run from shell command - /usr/bin/vacuumdb with connection - informations taken from configuration file (see - Configuration section of this manual page), but without - password prompting. If you use this option, please - configure your pg_hba.conf in a way - no password is prompted (trust). - - - - - - - - - - Number of input lines to process, default is to process - all the input lines. - - - - - - - - - - Input line number from which we begin to process (and - count). pgloader will skip all - preceding lines. - - - You can't use both and - at the same time. - - - - - - - - - - From which id do we begin to process - (and count) input lines. - - - When a composite key is used, you have to give each column - of the key separated by comma, on the form col_name=value. - - - Please notice using the --from-id - option implies pgloader will try to get - row id of each row, it being on the interval processed or - not. This could have some performance impact, and you may - end up prefering to use --from instead. - - - Example: pgloader -I col1:val1,col2:val2 - - - You can't use both and - at the same time. - - - - - - - - - - Input data files encoding. Defaults to 'latin9'. - - - - - - - - - is the name of a configured Section describing some data - to load - - - Section arguments are optionnal, if no section is given - all configured sections are processed. 
- - - - - - - - GLOBAL CONFIGURATION SECTION - - The configuration file has a .ini file syntax, its first section - has to be the pgsql one, defining how to - access to the PostgreSQL database server where to load - data. Then you may define any number of sections, each one - describing a data loading task to be performed by - pgloader. - - - - The [pgsql] section has the following - options, which all must be set. - - - - - - - PostgreSQL database server name, for example - localhost. - - - - - - - - - PostgreSQL database server listening port, 5432. You have - to fill this entry. - - - - - - - - - The name of the database you want to load data into. - - - - - - - - - Connecting PostgreSQL user name. - - - - - - - - - The password of the user. The better is to grant a - trust access privilege in PostgreSQL - pg_hba.conf. Then you can set this - entry to whatever value you want to. - - - - - - - - - Set this parameter to have pgloader - connects to PostgreSQL using this encoding. - - - This parameter is optionnal and defaults to 'latin9'. - - - - - - - - - When issuing COPY PostgreSQL commands, - pgloader will not make a single big - COPY attempt, but copy copy_every lines - at a time. - - - This parameter is optionnal and defaults to 10000. - - - - - - - - - PostgreSQL COMMIT frequency, exprimed - in UPDATE orders. A good value is 1000, - that means commiting the SQL transaction every 1000 input - lines. - - - pgloader issues commit every - commit_every updates, on connection closing and when a SQL - error occurs. - - - This parameter is optionnal and defaults to 1000. - - - - - - - - - The field separator to use in COPY FROM produced statements. If - you don't specify this, the same separator as the one given in - field_sep parameter will be used. - - - Please note PostgreSQL requires a single char - properly encoded (see your client_encoding - parameter), or it abort in error and even may crash. - - - This parameter is optionnal and defaults to - field_sep. 
- - - - - - - - - For parameter effect description, see below (same name, table - local setting). - - - You can setup here a global escape caracter, to be - considered on each and every column of each and every - text-format table defined thereafter. - - - - - - - - - You can configure here how null value is - represented into your flat data file. - - - This parameter is optionnal and defaults to - '' (that is empty string). - - - - - - - - - You can configure here how empty values are represented into - your flat data file. - - - This parameter is optionnal and defaults to '\ - ' (that is backslash followed by space). - - - - - - - - COMMON FORMAT CONFIGURATION PARAMETERS - - You then can define any number of data section, and give them an - arbitrary name. Some options are required, some are actually - optionnals, in which case it is said so thereafter. - - - First, we'll go through common parameters, applicable whichever - format of data you're refering to. Then text-format only - parameters will be presented, followed by csv-only parameters. - - - - - - - The table name of the database where to load data. - - - - - - - - - The format data are to be found, either - text or csv. - - - See next sections for format specific options. - - - - - - - - - The absolute path to the input data file. The large object - files are to be found into the same directory. Their name - can be in the form [bc]lob[0-9a-f]{4}.[0-9a-f]{3}, but - this information is not used by - pgloader. - - - - - - - - - In case of errors processing input data, a human readable - log per rejected input data line is produced into the - reject_log file. - - - - - - - - - In case of errors processing input data, the rejected - input line is appended to the reject_data file. - - - - - - - - - The field separator used into the data file. 
The same - separator will be used by the generated - COPY commands, thus - pgloader does not have to deal with - escaping the delimiter it uses (input data has to have - escaped it). - - - This parameter is optionnal and defaults to pipe char '|'. - - - - - - - - - Set this parameter to have pgloader - connects to PostgreSQL using this encoding. - - - This parameter is optionnal and defaults to 'latin9'. If defined - on a table level, this local value will overwritte the global - one. - - - - - - - - - You can configure here how null value is - represented into your flat data file. - - - This parameter is optionnal and defaults to - '' (that is empty string). If defined on a - table level, this local value will overwritte the global one. - - - - - - - - - You can configure here how empty values are represented into - your flat data file. - - - This parameter is optionnal and defaults to '\ - ' (that is backslash followed by space). If defined on - a table level, this local value will overwritte the global one. - - - - - - - - - Table index definition, to be used in blob UPDATE'ing. You - define an index column by giving its name and its column - number (as found into your data file, and counting from 1) - separated by a colon. If your table has a composite key, - then you can define multiple columns here, separated by a - comma. - - - index = colname:3, other_colname:5 - - - - - - - - - You can define here table columns, with the same - definition format as in previous index - parameter. - - - Note you'll have to define here all the columns to be - found in data file, whether you want to use them all or - not. When not using them all, use the - only_cols parameter to restrict. - - - As of pgloader 2.2 the column list used - might not be the same as the table columns definition. - - - In case you have a lot a columns per table, you will want - to use multiple lines for this parameter value. 
Python - ConfigParser module knows how to read - multi-line parameters, you don't have to escape anything. - - - - - - - - - If you want to only load a part of the columns you have - into the data file, this option let you define which - columns you're interrested in. only_col - is a comma separated list of ranges or values, as in - following example. - - - only_cols = 1-3, 5 - - - This parameter is optionnal and defaults to the list of - all columns given on the columns - parameter list, in the colname order. - - - - - - - - - The definition of the colums where to find some blob or - clob reference. This definition is composed by a table - column name, a column number (couting from one) reference - into the Informix UNLOAD data file, and - a large object type, separated by a colon. You can have - several columns in this field, separated by a - comma. - - - Supported large objects type are Informix blob and clob, - the awaited configuration string are respectively - ifx_blob for binary (bytea) content - type and ifx_clob for text type values. - - - Here's an example: - - - blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob - - - - - - - - TEXT FORMAT CONFIGURATION PARAMETERS - - - - - - The UNLOAD command does not escape - newlines when they appear into table data. Hence, you may - obtain multi-line data files, where a single database row - (say tuple if you prefer to) can span multiple physical - lines into the unloaded file. - - - If this is your case, you may want to configure here the - number of columns per tuple. Then - pgloader will count columns and - buffer line input in order to re-assemble several physical - lines into one data row when needed. - - - This parameter is optionnal. - - - - - - - - - If this option is set to True, the - input data file is known to append a - field_sep as the last character of each - of its lines. With this option set, this last character is - then not considered as a field separator. 
- - - This parameter is optionnal and defaults to False. - - - - - - - - - Sometimes the input data file has field values containing - newlines, and the export program used (as Informix - UNLOAD command) escape in-field - newlines. So you want pgloader to keep - those newlines, while at the same time preserving them. - - - This option does the described work on specified fields - and considering the escaping character you configure, - following this syntax: - - - newline_escapes = colname:\, other_colname:§ - - - This parameter is optionnal, and the extra work is only - done when set. You can configure - newline_escapes for as many fields as - necessary, and you may configure a different escaping - character each time. - - - Please note that at the moment, - pgloader does only support one - character length newline_escapes. - - - When both a global (see [pgsql] section) - newline_escapes parameter and a table local - one are set, pgloader issues a warning and - only consider the global setting. - - - - - - - - - CSV FORMAT CONFIGURATION PARAMETERS - - - - - - - Controls how instances of quotechar appearing inside a - field should be themselves be quoted. When True, the - character is doubled. When False, the escapechar is used - as a prefix to the quotechar. It defaults to True. - - - - - - - - - A one-character string used by the writer to escape the - delimiter if quoting is set to QUOTE_NONE and the - quotechar if doublequote is False. On reading, the - escapechar removes any special meaning from the following - character. It defaults to None, which disables escaping. - - - - - - - - - A one-character string used to quote fields containing - special characters, such as the delimiter or quotechar, or - which contain new-line characters. It defaults to '"'. - - - - - - - - - When True, whitespace immediately following the delimiter - is ignored. The default is False. 
- - - - - - - - - CONFIGURATION EXAMPLE - - Please see the given configuration example which should be distributed - in - /usr/share/doc/pgloader/examples/pgloader.conf. - - - - - HISTORY - - pgloader was at first an Informix to - PostgreSQL migration helper which imported Informix large - objects directly into a PostgreSQL database. - - - - Then as we got some data we couldn't file tools to care about, - we decided ifx_blob would become - pgloader, as it had to be able to import all - Informix UNLOAD data. Those data contains escaped separator into - unquoted data field and multi-lines fields (\r and \n are not - escaped). - - - - - BUGS - - Please report bugs to Dimitri Fontaine <dim@tapoueh.org>. - - - When last line is alone on a COPY command and its - parsing ends in error (not enough columns read for example), no - information is given back by pgloader. - - - - - AUTHORS - - pgloader is written by Dimitri - Fontaine dim@tapoueh.org. - - - -
diff --git a/pgloader.1.txt b/pgloader.1.txt new file mode 100644 index 0000000..b035393 --- /dev/null +++ b/pgloader.1.txt @@ -0,0 +1,486 @@ += pgloader(1) = + +== NAME == + +pgloader - Import CSV data and Large Object to PostgreSQL + +== SYNOPSIS == + + pgloader [-c configuration file] [-p pedantic] [-d debug] [-v verbose] + [-n dryrun] [-Cn count] [-Fn from] [-In from id] [-E input files encoding] + [Section1 Section2] + +== DESCRIPTION == + ++pgloader+ imports data from a flat file and inserts it into a database +table. It uses a flat file per database table, and you can configure +as many Sections as you want, each one associating a table name and a +data file. + +Data are parsed and rewritten, then given to PostgreSQL +COPY+ +command. Parsing is necessary for dealing with end of lines and +possible trailing separator characters, and for column reordering: +your flat data file may not have the same column order as the database +table has. + ++pgloader+ is also able to load some large objects data into +PostgreSQL, as of now only Informix +UNLOAD+ data files are +supported. This command gives large objects data location information +into the main data file. +pgloader+ parses it and adds the +text+ or +bytea+ +content properly escaped to the +COPY+ data. + ++pgloader+ issues some timing statistics every +commit_every+ commits +(see Configuration for this setting). At the end of each section +processing, a summary of overall operations, numbers of updates and +commits, time it took in seconds, errors logged and database errors is +issued. + ++pgloader+ is available from +pgfoundry+ at +http://pgfoundry.org/projects/pgloader/[], where you'll find a debian +package, a source package and an anonymous CVS. + +== OPTIONS == + +In order for pgloader to run, you have to edit a configuration file +(see Configuration) consisting of Section definitions. Each section +refers to a PostgreSQL table into which some data is to be loaded. 
+ +-c, --config:: + + specifies the configuration file to use. The default file name is + pgloader.conf, searched for in the current working directory. + +-p, --pedantic:: + + activates the pedantic mode, where any warning is considered as a fatal + error, thus stopping the processing of the input file. + +-d, --debug:: + + makes pgloader say it all about what it does. debug implies verbose. + +-v, --verbose:: + + makes pgloader very verbose about what it does. + +-n, --dry-run:: + + makes pgloader simulate operations, that implies no database connection and + no data extraction from blob files. + +-T, --truncate:: + + makes pgloader issue a truncate SQL command before importing data. + +-V, --vacuum:: ++ +makes pgloader issue a vacuum full verbose analyse SQL command before +importing data. ++ +This vacuum is run from shell command /usr/bin/vacuumdb with +connection information taken from configuration file (see +Configuration section of this manual page), but without password +prompting. If you use this option, please configure your pg_hba.conf +in a way no password is prompted (trust). + +-C, --count:: + + Number of input lines to process, default is to process all the input + lines. + +-F, --from:: ++ +Input line number from which we begin to process (and count). pgloader +will skip all preceding lines. ++ +You can't use both -F and -I at the same time. + +-I, --from-id:: ++ +From which id do we begin to process (and count) input lines. ++ +When a composite key is used, you have to give each column of the key +separated by comma, in the form col_name=value. ++ +Please note that using the --from-id option implies pgloader will try to +get row id of each row, it being on the interval processed or +not. This could have some performance impact, and you may end up +preferring to use --from instead. ++ + Example: pgloader -I col1:val1,col2:val2 ++ +You can't use both -F and -I at the same time. + +-E, --encoding:: + + Input data files encoding. Defaults to 'latin9'. 
+ +Section:: ++ +is the name of a configured Section describing some data to load ++ +Section arguments are optional, if no section is given all configured +sections are processed. + +== GLOBAL CONFIGURATION SECTION == + +The configuration file has a .ini file syntax, its first section has +to be the pgsql one, defining how to access the PostgreSQL database +server where to load data. Then you may define any number of sections, +each one describing a data loading task to be performed by pgloader. + +The [pgsql] section has the following options, which all must be set. + +host:: + + PostgreSQL database server name, for example localhost. + +port:: + + PostgreSQL database server listening port, 5432. You have to fill this + entry. + +base:: + + The name of the database you want to load data into. + +user:: + + Connecting PostgreSQL user name. + +pass:: + + The password of the user. The best option is to grant a trust access privilege + in PostgreSQL pg_hba.conf. Then you can set this entry to whatever value + you want to. + +client_encoding:: ++ +Set this parameter to have pgloader connect to PostgreSQL using this +encoding. ++ +This parameter is optional and defaults to 'latin9'. + +datestyle:: ++ +Set this parameter to have pgloader connect to PostgreSQL using this +datestyle setting. ++ +This parameter is optional and has no default value, thus pgloader will +use whatever your PostgreSQL is configured to as default. + +copy_every:: ++ +When issuing +COPY+ PostgreSQL commands, pgloader will not make a +single big +COPY+ attempt, but copy copy_every lines at a time. ++ +This parameter is optional and defaults to 10000. + +commit_every:: ++ +PostgreSQL +COMMIT+ frequency, expressed in +UPDATE+ orders. A good +value is 1000, that means committing the SQL transaction every 1000 +input lines. ++ ++pgloader+ issues commit every +commit_every+ updates, on connection +closing and when a SQL error occurs. ++ +This parameter is optional and defaults to 1000. 
+ +copy_delimiter:: ++ +The field separator to use in +COPY FROM+ produced statements. If you +don't specify this, the same separator as the one given in +field_sep+ +parameter will be used. ++ +Please note PostgreSQL requires a single char properly encoded (see +your +client_encoding+ parameter), or it aborts with an error and may even +crash. ++ +This parameter is optional and defaults to +field_sep+. + +newline_escapes:: ++ +For parameter effect description, see below (same name, table local +setting). ++ +You can set up here a global escape character, to be considered on each +and every column of each and every text-format table defined +thereafter. + +null:: ++ +You can configure here how null value is represented into your flat +data file. ++ +This parameter is optional and defaults to '' (that is +empty string+). + +empty_string:: ++ +You can configure here how empty values are represented into your flat +data file. ++ +This parameter is optional and defaults to '\ ' (that is backslash +followed by space). + + +== COMMON FORMAT CONFIGURATION PARAMETERS == + +You then can define any number of data sections, and give them an arbitrary +name. Some options are required, some are actually optional, in which case it +is said so thereafter. + +First, we'll go through common parameters, applicable whichever format of data +you're referring to. Then text-format only parameters will be presented, +followed by csv-only parameters. + +table:: + + The table name of the database where to load data. + +format:: ++ +The format data are to be found, either text or csv. ++ +See next sections for format specific options. + +filename:: + + The absolute path to the input data file. The large object files + are to be found into the same directory. Their name can be in the + form +[bc]lob[0-9a-f]{4}.[0-9a-f]{3}+, but this information is not + used by +pgloader+. 
+ +reject_log:: + + In case of errors processing input data, a human readable log per rejected + input data line is produced into the +reject_log+ file. + +reject_data:: + + In case of errors processing input data, the rejected input line is + appended to the +reject_data+ file. + +field_sep:: ++ +The field separator used in the data file. The same separator will +be used by the generated +COPY+ commands, thus +pgloader+ does not +have to deal with escaping the delimiter it uses (input data has to +have escaped it). ++ +This parameter is optional and defaults to pipe char '|'. + +client_encoding:: ++ +Set this parameter to have pgloader connect to PostgreSQL using this +encoding. ++ +This parameter is optional and defaults to 'latin9'. If defined on a +table level, this local value will override the global one. + +datestyle:: ++ +Set this parameter to have pgloader connect to PostgreSQL using this ++datestyle+ setting. ++ +This parameter is optional and has no default. If defined on a table +level, this local value will override the global one. + +null:: ++ +You can configure here how null value is represented into your flat +data file. ++ +This parameter is optional and defaults to +''+ (that is empty +string). If defined on a table level, this local value will override +the global one. + +empty_string:: ++ +You can configure here how empty values are represented into your flat +data file. ++ +This parameter is optional and defaults to '\ ' (that is backslash +followed by space). If defined on a table level, this local value will +override the global one. + +index:: ++ +Table index definition, to be used in blob +UPDATE+'ing. You define an +index column by giving its name and its column number (as found into +your data file, and counting from 1) separated by a colon. If your +table has a composite key, then you can define multiple columns here, +separated by a comma. 
++ + index = colname:3, other_colname:5 + +columns:: ++ +You can define here table columns, with the same definition format as +in previous index parameter. ++ +Note you'll have to define here all the columns to be found in data +file, whether you want to use them all or not. When not using them +all, use the +only_cols+ parameter to restrict. ++ +As of +pgloader 2.2+ the column list used might not be the same as the +table columns definition. ++ +As of +pgloader 2.2.1+ you can omit column numbering if you want to, a +counter is then maintained for you, starting from 1 and set to +last +value + 1+ on each column, where +last value+ was either computed or +given in the config. So you can even omit only 'some' columns in +there. ++ + columns = x, y, a, b, d:6, c:5 ++ +In case you have a lot of columns per table, you will want to use +multiple lines for this parameter value. Python ConfigParser module +knows how to read multi-line parameters, you don't have to escape +anything. + +only_cols:: ++ +If you want to only load a part of the columns you have into the data +file, this option lets you define which columns you're interested +in. +only_cols+ is a comma separated list of ranges or values, as in +following example. ++ + only_cols = 1-3, 5 ++ +This parameter is optional and defaults to the list of all columns +given on the columns parameter list, in the colname order. + +blob_columns:: ++ +The definition of the columns where to find some blob or clob +reference. This definition is composed of a table column name, a +column number (counting from one) reference into the Informix +UNLOAD+ +data file, and a large object type, separated by a colon. You can have +several columns in this field, separated by a comma. ++ +Supported large object types are Informix blob and clob, the expected +configuration strings are respectively +ifx_blob+ for binary (bytea) +content type and +ifx_clob+ for text type values. 
++ +Here's an example: ++ + blob_columns = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob + +== TEXT FORMAT CONFIGURATION PARAMETERS == + +field_count:: ++ +The +UNLOAD+ command does not escape newlines when they appear in +table data. Hence, you may obtain multi-line data files, where a +single database row (say tuple if you prefer to) can span multiple +physical lines into the unloaded file. ++ +If this is your case, you may want to configure here the number of +columns per tuple. Then pgloader will count columns and buffer line +input in order to re-assemble several physical lines into one data row +when needed. ++ +This parameter is optional. + +trailing_sep:: ++ +If this option is set to True, the input data file is known to append +a +field_sep+ as the last character of each of its lines. With this +option set, this last character is then not considered as a field +separator. ++ +This parameter is optional and defaults to +False+. + +newline_escapes:: ++ +Sometimes the input data file has field values containing newlines, +and the export program used (as Informix +UNLOAD+ command) escapes +in-field newlines. So you want +pgloader+ to keep those newlines, +while at the same time preserving them. ++ +This option does the described work on specified fields and +considering the escaping character you configure, following this +syntax: ++ + newline_escapes = colname:\, other_colname:§ ++ +This parameter is optional, and the extra work is only done when +set. You can configure +newline_escapes+ for as many fields as +necessary, and you may configure a different escaping character each +time. ++ +Please note that at the moment, +pgloader+ only supports one +character length +newline_escapes+. ++ +When both a global (see +[pgsql]+ section) +newline_escapes+ parameter +and a table local one are set, +pgloader+ issues a warning and only +considers the global setting. 
+ +== CSV FORMAT CONFIGURATION PARAMETERS == + +doublequote:: + + Controls how instances of +quotechar+ appearing inside a field + should themselves be quoted. When +True+, the character is + doubled. When +False+, the +escapechar+ is used as a prefix to the + +quotechar+. It defaults to +True+. + +escapechar:: + + A one-character string used by the writer to escape the delimiter + if quoting is set to +QUOTE_NONE+ and the +quotechar+ if + +doublequote+ is +False+. On reading, the +escapechar+ removes any + special meaning from the following character. It defaults to + +None+, which disables escaping. + +quotechar:: + + A one-character string used to quote fields containing special + characters, such as the +delimiter+ or +quotechar+, or which + contain new-line characters. It defaults to '"'. + +skipinitialspace:: + + When +True+, whitespace immediately following the +delimiter+ is + ignored. The default is +False+. + +== CONFIGURATION EXAMPLE == + +Please see the given configuration example which should be distributed in ++/usr/share/doc/pgloader/examples/pgloader.conf+. + +The example configuration file comes with example data and can be used +as a unit test of +pgloader+. + +== HISTORY == + ++pgloader+ has first been a +tcl+ tool written by Christopher +Kings-Lynne and Jan Wieck, and then maintained by Jean-Paul +Argudo. When it became clear it would be easier to rewrite it in +another language than to properly learn +tcl+ and add to the project +missing options, +pgloader+ was rewritten in python by Dimitri +Fontaine. + ++pgloader+ was rewritten to act as an Informix to PostgreSQL migration +helper which imported Informix large objects directly into a +PostgreSQL database. + +Then as we got some data we couldn't find tools to care about, we +decided ifx_blob would become +pgloader+, as it had to be able to +import all Informix +UNLOAD+ data. Those data contain escaped +separators in unquoted data fields and multi-line fields (+\r+ and ++\n+ are not escaped). 
+ +== BUGS == + +Please report bugs to Dimitri Fontaine . + +When last line is alone on a +COPY+ command and its parsing ends in +error (not enough columns read for example), no information is given +back by +pgloader+. + +== AUTHORS == + ++pgloader+ is written by Dimitri Fontaine . + diff --git a/pgloader.py b/pgloader.py index dabcec6..c76cd35 100644 --- a/pgloader.py +++ b/pgloader.py @@ -1,5 +1,4 @@ #! /usr/bin/env python -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine """ @@ -165,6 +164,9 @@ def parse_config(conffile): if config.has_option(section, 'client_encoding'): dbconn.client_encoding = config.get(section, 'client_encoding') + if config.has_option(section, 'datestyle'): + dbconn.datestyle = config.get(section, 'datestyle') + if config.has_option(section, 'copy_every'): dbconn.copy_every = config.getint(section, 'copy_every') @@ -174,6 +176,12 @@ def parse_config(conffile): if config.has_option(section, 'copy_delimiter'): dbconn.copy_sep = config.get(section, 'copy_delimiter') + # optionnal global newline_escapes + if config.has_option(section, 'newline_escapes'): + setting = pgloader.tools.parse_config_string( + config.get(section, 'newline_escapes')) + pgloader.options.NEWLINE_ESCAPES = setting + # Then there are null and empty_string optionnal parameters # They canbe overriden in specific table configuration if config.has_option(section, 'null'): @@ -184,12 +192,6 @@ def parse_config(conffile): pgloader.options.EMPTY_STRING = pgloader.tools.parse_config_string( config.get(section, 'empty_string')) - # optionnal global newline_escapes - if config.has_option(section, 'newline_escapes'): - setting = pgloader.tools.parse_config_string( - config.get(section, 'newline_escapes')) - pgloader.options.NEWLINE_ESCAPES = setting - except Exception, error: print "Error: Could not initialize PostgreSQL connection:" print error diff --git a/pgloader/csvreader.py b/pgloader/csvreader.py index a857b44..3a155f2 100644 --- a/pgloader/csvreader.py +++ 
b/pgloader/csvreader.py @@ -1,4 +1,3 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # pgloader text format reader diff --git a/pgloader/db.py b/pgloader/db.py index c981823..35e4266 100644 --- a/pgloader/db.py +++ b/pgloader/db.py @@ -1,4 +1,3 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # pgloader database connection handling @@ -9,7 +8,7 @@ from cStringIO import StringIO from options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC from options import TRUNCATE, VACUUM -from options import INPUT_ENCODING, PG_CLIENT_ENCODING +from options import INPUT_ENCODING, PG_CLIENT_ENCODING, DATESTYLE from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING from tools import PGLoader_Error @@ -40,6 +39,7 @@ class db: self.copy_every = copy_every self.commit_every = commit_every self.client_encoding = client_encoding + self.datestyle = DATESTYLE self.null = NULL self.empty_string = EMPTY_STRING @@ -72,6 +72,22 @@ class db: cursor.execute(sql, [self.client_encoding]) cursor.close() + def set_datestyle(self): + """ set session datestyle to self.datestyle """ + + if self.datestyle is None: + return + + if DEBUG: + # debug only cause reconnecting happens on every + # configured section + print 'Setting datestyle to %s' % self.datestyle + + sql = 'set session datestyle to %s' + cursor = self.dbconn.cursor() + cursor.execute(sql, [self.datestyle]) + cursor.close() + def reset(self): """ reset internal counters and open a new database connection """ self.buffer = None @@ -94,6 +110,7 @@ class db: self.dbconn = psycopg.connect(self.dsn) self.set_encoding() + self.set_datestyle() def print_stats(self): """ output some stats about recent activity """ @@ -339,8 +356,8 @@ class db: except psycopg.DatabaseError, error: # non recoverable error mesg = "\n".join(["Please check PostgreSQL logs", - "HINT: double check your client_encoding" + - " and copy_delimiter settings"]) + "HINT: double check your client_encoding,"+ + " datestyle and copy_delimiter 
settings"]) raise PGLoader_Error, mesg # prepare next run diff --git a/pgloader/lo.py b/pgloader/lo.py index 5cfe801..3175a9b 100644 --- a/pgloader/lo.py +++ b/pgloader/lo.py @@ -1,4 +1,3 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # pgloader Large Object support diff --git a/pgloader/options.py b/pgloader/options.py index 21ea743..6f5f979 100644 --- a/pgloader/options.py +++ b/pgloader/options.py @@ -1,10 +1,10 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # Some common options, for each module to get them INPUT_ENCODING = None PG_CLIENT_ENCODING = 'latin9' +DATESTYLE = None COPY_SEP = None FIELD_SEP = '|' diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py index 85765ae..626b3cf 100644 --- a/pgloader/pgloader.py +++ b/pgloader/pgloader.py @@ -1,4 +1,3 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # pgloader main class @@ -77,6 +76,14 @@ class PGLoader: print "client_encoding: '%s'" % self.db.client_encoding + # optionnal local option datestyle + if config.has_option(name, 'datestyle'): + self.db.datestyle = config.get(name, 'datestyle') + + if DEBUG: + print "datestyle: '%s'" % self.db.datestyle + + ## # data filename for opt in ('table', 'filename'): @@ -252,8 +259,14 @@ class PGLoader: f = self.__dict__[attr] = [] try: + serial = 1 + for field_def in str.split(','): - properties = [x.strip() for x in field_def.split(':')] + if argtype == 'int' and field_def.find(':') == -1: + # support for automatic ordering + properties = [field_def.strip(), serial] + else: + properties = [x.strip() for x in field_def.split(':')] if not btype: # normal column definition, for COPY usage @@ -265,6 +278,10 @@ class PGLoader: # UPDATE usage colname, arg, btype = properties f.append((colname, __getarg(arg, argtype), btype)) + + # update serial + if argtype == 'int': + serial = int(arg) + 1 except Exception, error: # FIXME: make some errors and write some error messages diff --git a/pgloader/reader.py b/pgloader/reader.py 
index 71aadf5..cbab066 100644 --- a/pgloader/reader.py +++ b/pgloader/reader.py @@ -1,4 +1,3 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # pgloader data reader interface and defaults diff --git a/pgloader/textreader.py b/pgloader/textreader.py index 34304b6..2bfda61 100644 --- a/pgloader/textreader.py +++ b/pgloader/textreader.py @@ -1,4 +1,3 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # pgloader text format reader diff --git a/pgloader/tools.py b/pgloader/tools.py index 7e801d2..bc6e0fe 100644 --- a/pgloader/tools.py +++ b/pgloader/tools.py @@ -1,4 +1,3 @@ -# -*- coding: ISO-8859-15 -*- # Author: Dimitri Fontaine # # pgloader librairies