* Support for datestyle setting
* Support for omitting column numbering
* Change documentation source format from SGML to asciidoc
* New version 2.2.1
dim 2007-08-23 10:38:09 +00:00
parent 8ed8219e37
commit 921db51d65
16 changed files with 559 additions and 882 deletions

View File

@ -1,5 +1,4 @@
DOCS = pgloader.1.sgml
GARBAGE = manpage.links manpage.refs
DOCS = pgloader.1.txt
# debian setting
DESTDIR =
@ -18,6 +17,11 @@ install:
cp -a $(libs) $(libdir)/pgloader
cp -a $(examples) $(exdir)
man: $(DOCS)
docbook2man $(DOCS) 2>/dev/null
-rm -f $(GARBAGE)
html: $(DOCS)
asciidoc -a toc $<
pgloader.1.xml: $(DOCS)
asciidoc -d manpage -b docbook $<
man: pgloader.1.xml
xmlto man $<

8
debian/changelog vendored
View File

@ -1,3 +1,11 @@
pgloader (2.2.1) unstable; urgency=low
* Support for datestyle setting
* Support for omitting column numbering
* Change documentation source format from SGML to asciidoc
-- Dimitri Fontaine <dim@tapoueh.org> Thu, 23 Aug 2007 12:35:34 +0200
pgloader (2.2.0) unstable; urgency=low
* Support for partial loading of data (subrange(s) of columns)

2
debian/files vendored
View File

@ -1 +1 @@
pgloader_2.0.2_all.deb misc extra
pgloader_2.2.0_all.deb misc extra

View File

@ -18,6 +18,7 @@ newline_escapes = \
[simple]
table = simple
format = text
datestyle = dmy
filename = simple/simple.data
field_sep = |
trailing_sep = True
@ -75,6 +76,6 @@ format = csv
filename = csv/csv.data
field_sep = ,
quotechar = "
columns = x:1, y:2, a:3, b:4, c:5, d:6
columns = x, y, a, b, d:6, c:5
only_cols = 3-6

View File

@ -1,7 +1,7 @@
1|some first row text|2006-11-11|
2|some second row text|2006-11-11|
3|some third row text|2006-10-12|
2|some second row text|13/11/2006|
3|some third row text|12-10-2006|
4|\ |2006-10-4|
5|some fifth row text|2006-5-12|
6|some sixth row text|2006-7-10|
6|some sixth row text|10/7/6|
7|some null date to play with||

View File

@ -1,853 +0,0 @@
<!doctype refentry PUBLIC "-//OASIS//DTD DocBook V4.1//EN">
<refentry>
<refentryinfo>
<address>
<email>dim@tapoueh.org</email>
</address>
<author>
<firstname>Dimitri</firstname>
<surname>Fontaine</surname>
</author>
<date>August 2006</date>
<copyright>
<year>2006</year>
<holder>Dimitri Fontaine</holder>
</copyright>
</refentryinfo>
<refmeta>
<refentrytitle>pgloader</refentrytitle>
<manvolnum>1</manvolnum>
</refmeta>
<refnamediv>
<refname>pgloader</refname>
<refpurpose>
Import CSV data and Large Object to PostgreSQL
</refpurpose>
</refnamediv>
<refsynopsisdiv>
<cmdsynopsis>
<command>pgloader</command>
<arg><option>-c</option> configuration file</arg>
<arg><option>-p</option> pedantic</arg>
<arg><option>-d</option> debug</arg>
<arg><option>-v</option> verbose</arg>
<arg><option>-n</option> dry run</arg>
<arg><option>-Cn</option> count</arg>
<arg><option>-Fn</option> from</arg>
<arg><option>-In</option> from id</arg>
<arg><option>-E</option> input files encoding</arg>
<arg>Section1 Section2</arg>
</cmdsynopsis>
</refsynopsisdiv>
<refsect1>
<title>DESCRIPTION</title>
<para>
<command>pgloader</command> imports data from a flat file and
insert it into a database table. It uses a flat file per
database table, and you can configure as many Sections as you
want, each one associating a table name and a data file.
</para>
<para>
Data are parsed and rewritten, then given to PostgreSQL
<command>COPY</command> command. Parsing is necessary for
dealing with end of lines and eventual trailing separator
characters, and for column reordering: your flat data file may
not have the same column order as the databse table has.
</para>
<para>
<command>pgloader</command> is also able to load some large
objects data into PostgreSQL, as of now only Informix
<command>UNLOAD</command> data files are supported. This command
gives large objects data location information into the main data
file. <command>pgloader</command> parse it and produces and SQL
UPDATE order per large object, and commit those orders once
every <command>commit_every</command> configuration parameter.
</para>
<para>
<command>pgloader</command> issue some timing statistics
every <command>commit_every</command> commits (see Configuration
for this setting). At the end of each section processing, a
summary of overall operations, numbers of updates and commits,
time it took in seconds, errors logged and database errors is
issued.
</para>
</refsect1>
<refsect1>
<title>OPTIONS</title>
<para>
In order for <command>pgloader</command> to run, you have to
edit a configuration file (see Configuration) consisting of
Section definitions. Each section refers to a PostgreSQL table
into which some data is to be loaded.
</para>
<variablelist>
<varlistentry>
<term><option>-c</option></term>
<term><option>--config</option></term>
<listitem>
<para>
specifies the configuration file to use. The default file
name is <filename>pgloader.conf</filename>, searched into
current working directory.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-p</option></term>
<term><option>--pedantic</option></term>
<listitem>
<para>
activates the <command>pedantic</command> mode, where any
warning is considered as a fatal error, thus stopping the
processing of the input file.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-d</option></term>
<term><option>--debug</option></term>
<listitem>
<para>
makes <command>pgloader</command> say it all about what it
does. debug implies verbose.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-v</option></term>
<term><option>--verbose</option></term>
<listitem>
<para>
makes <command>pgloader</command> very verbose about
what it does.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-n</option></term>
<term><option>--dry-run</option></term>
<listitem>
<para>
makes <command>pgloader</command> simulate operations,
that implies no database connection and no data extraction
from blob files.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-T</option></term>
<term><option>--truncate</option></term>
<listitem>
<para>
makes <command>pgloader</command> issue a truncate SQL
command before importing data.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-V</option></term>
<term><option>--vacuum</option></term>
<listitem>
<para>
makes <command>pgloader</command> issue a vacuum full
verbose analyse SQL command before importing data.
</para>
<para>
This vacuum is run from shell command
<command>/usr/bin/vacuumdb</command> with connection
informations taken from configuration file (see
Configuration section of this manual page), but without
password prompting. If you use this option, please
configure your <filename>pg_hba.conf</filename> in a way
no password is prompted (trust).
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-C</option></term>
<term><option>--count</option></term>
<listitem>
<para>
Number of input lines to process, default is to process
all the input lines.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-F</option></term>
<term><option>--from</option></term>
<listitem>
<para>
Input line number from which we begin to process (and
count). <command>pgloader</command> will skip all
preceding lines.
</para>
<para>
You can't use both <option>-F</option> and
<option>-I</option> at the same time.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-I</option></term>
<term><option>--from-id</option></term>
<listitem>
<para>
From which <command>id</command> do we begin to process
(and count) input lines.
</para>
<para>
When a composite key is used, you have to give each column
of the key separated by comma, on the form col_name=value.
</para>
<para>
Please notice using the <command>--from-id</command>
option implies <command>pgloader</command> will try to get
row id of each row, it being on the interval processed or
not. This could have some performance impact, and you may
end up prefering to use <command>--from</command> instead.
</para>
<para>
Example: <command>pgloader -I col1:val1,col2:val2</command>
</para>
<para>
You can't use both <option>-F</option> and
<option>-I</option> at the same time.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-E</option></term>
<term><option>--encoding</option></term>
<listitem>
<para>
Input data files encoding. Defaults to 'latin9'.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>Section</option></term>
<listitem>
<para>
is the name of a configured Section describing some data
to load
</para>
<para>
Section arguments are optionnal, if no section is given
all configured sections are processed.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>GLOBAL CONFIGURATION SECTION</title>
<para>
The configuration file has a .ini file syntax, its first section
has to be the <command>pgsql</command> one, defining how to
access to the PostgreSQL database server where to load
data. Then you may define any number of sections, each one
describing a data loading task to be performed by
<command>pgloader</command>.
</para>
<para>
The <command>[pgsql]</command> section has the following
options, which all must be set.
</para>
<variablelist>
<varlistentry>
<term><option>host</option></term>
<listitem>
<para>
PostgreSQL database server name, for example
<filename>localhost</filename>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>port</option></term>
<listitem>
<para>
PostgreSQL database server listening port, 5432. You have
to fill this entry.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>base</option></term>
<listitem>
<para>
The name of the database you want to load data into.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>user</option></term>
<listitem>
<para>
Connecting PostgreSQL user name.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>pass</option></term>
<listitem>
<para>
The password of the user. The better is to grant a
<command>trust</command> access privilege in PostgreSQL
<filename>pg_hba.conf</filename>. Then you can set this
entry to whatever value you want to.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>client_encoding</option></term>
<listitem>
<para>
Set this parameter to have <command>pgloader</command>
connects to PostgreSQL using this encoding.
</para>
<para>
This parameter is optionnal and defaults to 'latin9'.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>copy_every</option></term>
<listitem>
<para>
When issuing <command>COPY</command> PostgreSQL commands,
<command>pgloader</command> will not make a single big
COPY attempt, but copy <command>copy_every</command> lines
at a time.
</para>
<para>
This parameter is optionnal and defaults to 10000.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>commit_every</option></term>
<listitem>
<para>
PostgreSQL <command>COMMIT</command> frequency, exprimed
in <command>UPDATE</command> orders. A good value is 1000,
that means commiting the SQL transaction every 1000 input
lines.
</para>
<para>
<command>pgloader</command> issues commit every
commit_every updates, on connection closing and when a SQL
error occurs.
</para>
<para>
This parameter is optionnal and defaults to 1000.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>copy_delimiter</option></term>
<listitem>
<para>
The field separator to use in COPY FROM produced statements. If
you don't specify this, the same separator as the one given in
<command>field_sep</command> parameter will be used.
</para>
<para>
Please note <command>PostgreSQL</command> requires a single char
properly encoded (see your <command>client_encoding</command>
parameter), or it abort in error and even may crash.
</para>
<para>
This parameter is optionnal and defaults to
<command>field_sep</command>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>newline_escapes</option></term>
<listitem>
<para>
For parameter effect description, see below (same name, table
local setting).
</para>
<para>
You can setup here a global escape caracter, to be
considered on each and every column of each and every
text-format table defined thereafter.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>null</option></term>
<listitem>
<para>
You can configure here how <command>null</command> value is
represented into your flat data file.
</para>
<para>
This parameter is optionnal and defaults to
<command>''</command> (that is empty string).
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>empty_string</option></term>
<listitem>
<para>
You can configure here how empty values are represented into
your flat data file.
</para>
<para>
This parameter is optionnal and defaults to <command>'\
'</command> (that is backslash followed by space).
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>COMMON FORMAT CONFIGURATION PARAMETERS</title>
<para>
You then can define any number of data section, and give them an
arbitrary name. Some options are required, some are actually
optionnals, in which case it is said so thereafter.
</para>
<para>
First, we'll go through common parameters, applicable whichever
format of data you're refering to. Then text-format only
parameters will be presented, followed by csv-only parameters.
</para>
<variablelist>
<varlistentry>
<term><option>table</option></term>
<listitem>
<para>
The table name of the database where to load data.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>format</option></term>
<listitem>
<para>
The format data are to be found, either
<command>text</command> or <command>csv</command>.
</para>
<para>
See next sections for format specific options.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>filename</option></term>
<listitem>
<para>
The absolute path to the input data file. The large object
files are to be found into the same directory. Their name
can be in the form [bc]lob[0-9a-f]{4}.[0-9a-f]{3}, but
this information is not used by
<command>pgloader</command>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>reject_log</option></term>
<listitem>
<para>
In case of errors processing input data, a human readable
log per rejected input data line is produced into the
reject_log file.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>reject_data</option></term>
<listitem>
<para>
In case of errors processing input data, the rejected
input line is appended to the reject_data file.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>field_sep</option></term>
<listitem>
<para>
The field separator used into the data file. The same
separator will be used by the generated
<command>COPY</command> commands, thus
<command>pgloader</command> does not have to deal with
escaping the delimiter it uses (input data has to have
escaped it).
</para>
<para>
This parameter is optionnal and defaults to pipe char '|'.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>client_encoding</option></term>
<listitem>
<para>
Set this parameter to have <command>pgloader</command>
connects to PostgreSQL using this encoding.
</para>
<para>
This parameter is optionnal and defaults to 'latin9'. If defined
on a table level, this local value will overwritte the global
one.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>null</option></term>
<listitem>
<para>
You can configure here how <command>null</command> value is
represented into your flat data file.
</para>
<para>
This parameter is optionnal and defaults to
<command>''</command> (that is empty string). If defined on a
table level, this local value will overwritte the global one.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>empty_string</option></term>
<listitem>
<para>
You can configure here how empty values are represented into
your flat data file.
</para>
<para>
This parameter is optionnal and defaults to <command>'\
'</command> (that is backslash followed by space). If defined on
a table level, this local value will overwritte the global one.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>index</option></term>
<listitem>
<para>
Table index definition, to be used in blob UPDATE'ing. You
define an index column by giving its name and its column
number (as found into your data file, and counting from 1)
separated by a colon. If your table has a composite key,
then you can define multiple columns here, separated by a
comma.
</para>
<para>
index = colname:3, other_colname:5
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>columns</option></term>
<listitem>
<para>
You can define here table columns, with the same
definition format as in previous <command>index</command>
parameter.
</para>
<para>
Note you'll have to define here all the columns to be
found in data file, whether you want to use them all or
not. When not using them all, use the
<command>only_cols</command> parameter to restrict.
</para>
<para>
As of <command>pgloader 2.2</command> the column list used
might not be the same as the table columns definition.
</para>
<para>
In case you have a lot a columns per table, you will want
to use multiple lines for this parameter value. Python
<command>ConfigParser</command> module knows how to read
multi-line parameters, you don't have to escape anything.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>only_cols</option></term>
<listitem>
<para>
If you want to only load a part of the columns you have
into the data file, this option let you define which
columns you're interrested in. <command>only_col</command>
is a comma separated list of ranges or values, as in
following example.
</para>
<para>
only_cols = 1-3, 5
</para>
<para>
This parameter is optionnal and defaults to the list of
all columns given on the <command>columns</command>
parameter list, in the colname order.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>blob_columns</option></term>
<listitem>
<para>
The definition of the colums where to find some blob or
clob reference. This definition is composed by a table
column name, a column number (couting from one) reference
into the Informix <command>UNLOAD</command> data file, and
a large object type, separated by a colon. You can have
several columns in this field, separated by a
comma.
</para>
<para>
Supported large objects type are Informix blob and clob,
the awaited configuration string are respectively
<command>ifx_blob</command> for binary (bytea) content
type and <command>ifx_clob</command> for text type values.
</para>
<para>
Here's an example:
</para>
<para>
blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>TEXT FORMAT CONFIGURATION PARAMETERS</title>
<variablelist>
<varlistentry>
<term><option>field_count</option></term>
<listitem>
<para>
The <command>UNLOAD</command> command does not escape
newlines when they appear into table data. Hence, you may
obtain multi-line data files, where a single database row
(say tuple if you prefer to) can span multiple physical
lines into the unloaded file.
</para>
<para>
If this is your case, you may want to configure here the
number of columns per tuple. Then
<command>pgloader</command> will count columns and
buffer line input in order to re-assemble several physical
lines into one data row when needed.
</para>
<para>
This parameter is optionnal.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>trailing_sep</option></term>
<listitem>
<para>
If this option is set to <command>True</command>, the
input data file is known to append a
<command>field_sep</command> as the last character of each
of its lines. With this option set, this last character is
then not considered as a field separator.
</para>
<para>
This parameter is optionnal and defaults to False.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>newline_escapes</option></term>
<listitem>
<para>
Sometimes the input data file has field values containing
newlines, and the export program used (as Informix
<command>UNLOAD</command> command) escape in-field
newlines. So you want <command>pgloader</command> to keep
those newlines, while at the same time preserving them.
</para>
<para>
This option does the described work on specified fields
and considering the escaping character you configure,
following this syntax:
</para>
<para>
newline_escapes = colname:\, other_colname:§
</para>
<para>
This parameter is optionnal, and the extra work is only
done when set. You can configure
<command>newline_escapes</command> for as many fields as
necessary, and you may configure a different escaping
character each time.
</para>
<para>
Please note that at the moment,
<command>pgloader</command> does only support one
character length <command>newline_escapes</command>.
</para>
<para>
When both a global (see <command>[pgsql]</command> section)
<command>newline_escapes</command> parameter and a table local
one are set, <command>pgloader</command> issues a warning and
only consider the global setting.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>CSV FORMAT CONFIGURATION PARAMETERS</title>
<variablelist>
<varlistentry>
<term><option>doublequote</option></term>
<listitem>
<para>
Controls how instances of quotechar appearing inside a
field should be themselves be quoted. When True, the
character is doubled. When False, the escapechar is used
as a prefix to the quotechar. It defaults to True.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>escapechar</option></term>
<listitem>
<para>
A one-character string used by the writer to escape the
delimiter if quoting is set to QUOTE_NONE and the
quotechar if doublequote is False. On reading, the
escapechar removes any special meaning from the following
character. It defaults to None, which disables escaping.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>quotechar</option></term>
<listitem>
<para>
A one-character string used to quote fields containing
special characters, such as the delimiter or quotechar, or
which contain new-line characters. It defaults to '"'.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>skipinitialspace</option></term>
<listitem>
<para>
When True, whitespace immediately following the delimiter
is ignored. The default is False.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>CONFIGURATION EXAMPLE</title>
<para>
Please see the given configuration example which should be distributed
in
<filename>/usr/share/doc/pgloader/examples/pgloader.conf</filename>.
</para>
</refsect1>
<refsect1>
<title>HISTORY</title>
<para>
<command>pgloader</command> was at first an Informix to
PostgreSQL migration helper which imported Informix large
objects directly into a PostgreSQL database.
</para>
<para>
Then as we got some data we couldn't file tools to care about,
we decided <command>ifx_blob</command> would become
<command>pgloader</command>, as it had to be able to import all
Informix UNLOAD data. Those data contains escaped separator into
unquoted data field and multi-lines fields (\r and \n are not
escaped).
</para>
</refsect1>
<refsect1>
<title>BUGS</title>
<para>
Please report bugs to Dimitri Fontaine &lt;dim@tapoueh.org&gt;.
</para>
<para>
When last line is alone on a <command>COPY</command> command and its
parsing ends in error (not enough columns read for example), no
information is given back by <command>pgloader</command>.
</para>
</refsect1>
<refsect1>
<title>AUTHORS</title>
<para>
<command>pgloader</command> is written by <author>Dimitri
Fontaine</author> <email>dim@tapoueh.org</email>.
</para>
</refsect1>
</refentry>

486
pgloader.1.txt Normal file
View File

@ -0,0 +1,486 @@
= pgloader(1) =
== NAME ==
pgloader - Import CSV data and Large Object to PostgreSQL
== SYNOPSIS ==
pgloader [-c configuration file] [-p pedantic] [-d debug] [-v verbose]
[-n dryrun] [-Cn count] [-Fn from] [-In from id] [-E input files encoding]
[Section1 Section2]
== DESCRIPTION ==
+pgloader+ imports data from a flat file and inserts it into a database
table. It uses a flat file per database table, and you can configure
as many Sections as you want, each one associating a table name and a
data file.
Data are parsed and rewritten, then given to the PostgreSQL +COPY+
command. Parsing is necessary for dealing with end-of-line and
possible trailing separator characters, and for column reordering:
your flat data file may not have the same column order as the database
table.
+pgloader+ is also able to load some large object data into
PostgreSQL; as of now only Informix +UNLOAD+ data files are
supported. This command puts large object location information
into the main data file. +pgloader+ parses it and adds the +text+ or
+bytea+ content, properly escaped, to the +COPY+ data.
+pgloader+ issues timing statistics every +commit_every+ commits
(see Configuration for this setting). At the end of each section
processing, a summary of overall operations, numbers of updates and
commits, time it took in seconds, errors logged and database errors is
issued.
+pgloader+ is available from +pgfoundry+ at
http://pgfoundry.org/projects/pgloader/[], where you'll find a debian
package, a source package and an anonymous CVS.
== OPTIONS ==
In order for pgloader to run, you have to edit a configuration file
(see Configuration) consisting of Section definitions. Each section
refers to a PostgreSQL table into which some data is to be loaded.
-c, --config::
specifies the configuration file to use. The default file name is
pgloader.conf, searched for in the current working directory.
-p, --pedantic::
activates the pedantic mode, where any warning is considered as a fatal
error, thus stopping the processing of the input file.
-d, --debug::
makes pgloader say it all about what it does. debug implies verbose.
-v, --verbose::
makes pgloader very verbose about what it does.
-n, --dry-run::
makes pgloader simulate operations, which implies no database connection and
no data extraction from blob files.
-T, --truncate::
makes pgloader issue a truncate SQL command before importing data.
-V, --vacuum::
+
makes pgloader issue a vacuum full verbose analyse SQL command before
importing data.
+
This vacuum is run from the shell command /usr/bin/vacuumdb with
connection information taken from the configuration file (see the
Configuration section of this manual page), but without password
prompting. If you use this option, please configure your pg_hba.conf
in such a way that no password is prompted (trust).
-C, --count::
Number of input lines to process; the default is to process all the input
lines.
-F, --from::
+
Input line number from which we begin to process (and count). pgloader
will skip all preceding lines.
+
You can't use both -F and -I at the same time.
-I, --from-id::
+
From which id do we begin to process (and count) input lines.
+
When a composite key is used, you have to give each column of the key
separated by a comma, in the form col_name=value.
+
Please note that using the --from-id option implies pgloader will try
to get the row id of each row, whether it is in the processed interval
or not. This could have some performance impact, and you may end up
preferring to use --from instead.
+
Example: pgloader -I col1:val1,col2:val2
+
You can't use both -F and -I at the same time.
-E, --encoding::
Input data files encoding. Defaults to 'latin9'.
Section::
+
is the name of a configured Section describing some data to load
+
Section arguments are optional; if no section is given, all configured
sections are processed.
== GLOBAL CONFIGURATION SECTION ==
The configuration file uses a .ini file syntax; its first section has
to be the pgsql one, defining how to access the PostgreSQL database
server where to load data. Then you may define any number of sections,
each one describing a data loading task to be performed by pgloader.
The [pgsql] section has the following options, which all must be set.
host::
PostgreSQL database server name, for example localhost.
port::
PostgreSQL database server listening port, usually 5432. You have to
fill in this entry.
base::
The name of the database you want to load data into.
user::
Connecting PostgreSQL user name.
pass::
The password of the user. The best approach is to grant a trust access
privilege in PostgreSQL pg_hba.conf; then you can set this entry to
whatever value you want.
client_encoding::
+
Set this parameter to have pgloader connect to PostgreSQL using this
encoding.
+
This parameter is optional and defaults to 'latin9'.
datestyle::
+
Set this parameter to have pgloader connect to PostgreSQL using this
datestyle setting.
+
This parameter is optional and has no default value, so pgloader will
use whatever default your PostgreSQL server is configured with.
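For illustration, here is a minimal Python sketch of what this setting amounts to, adapted from the +pgloader/db.py+ change in this commit: when a datestyle is configured, pgloader issues a +SET SESSION datestyle+ on each (re)connection. The function name and the bare DB-API connection argument are illustrative, not pgloader's exact interface.
----
# Sketch adapted from this commit's db.set_datestyle(); names are illustrative.
def set_datestyle(dbconn, datestyle):
    """Apply the configured datestyle to the current session, if any."""
    if datestyle is None:
        # no datestyle configured: keep the server's default setting
        return
    cursor = dbconn.cursor()
    # run once per (re)connection, before any COPY is attempted
    cursor.execute('SET SESSION datestyle TO %s', [datestyle])
    cursor.close()
----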
copy_every::
+
When issuing PostgreSQL +COPY+ commands, pgloader will not make a
single big +COPY+ attempt, but will copy copy_every lines at a time.
+
This parameter is optional and defaults to 10000.
commit_every::
+
PostgreSQL +COMMIT+ frequency, expressed in +UPDATE+ orders. A good
value is 1000, which means committing the SQL transaction every 1000
input lines.
+
+pgloader+ issues a commit every +commit_every+ updates, on connection
closing and when a SQL error occurs.
+
This parameter is optional and defaults to 1000.
copy_delimiter::
+
The field separator to use in +COPY FROM+ produced statements. If you
don't specify this, the same separator as the one given in +field_sep+
parameter will be used.
+
Please note PostgreSQL requires a single, properly encoded character (see
your +client_encoding+ parameter), or it will abort with an error and may
even crash.
+
This parameter is optional and defaults to +field_sep+.
newline_escapes::
+
For parameter effect description, see below (same name, table local
setting).
+
You can set up a global escape character here, to be considered on each
and every column of each and every text-format table defined
thereafter.
null::
+
You can configure here how the null value is represented in your flat
data file.
+
This parameter is optional and defaults to '' (that is, the +empty string+).
empty_string::
+
You can configure here how empty values are represented in your flat
data file.
+
This parameter is optional and defaults to '\ ' (that is, a backslash
followed by a space).
== COMMON FORMAT CONFIGURATION PARAMETERS ==
You can then define any number of data sections and give them arbitrary
names. Some options are required, some are actually optional, in which case
it is said so thereafter.
First, we'll go through common parameters, applicable whichever format of data
you're referring to. Then text-format-only parameters will be presented,
followed by csv-only parameters.
table::
The name of the database table into which to load data.
format::
+
The format data are to be found, either text or csv.
+
See next sections for format specific options.
filename::
The absolute path to the input data file. The large object files
are to be found in the same directory. Their name can be in the
form +[bc]lob[0-9a-f]{4}.[0-9a-f]{3}+, but this information is not
used by +pgloader+.
reject_log::
In case of errors processing input data, a human readable log per rejected
input data line is produced into the +reject_log+ file.
reject_data::
In case of errors processing input data, the rejected input line is
appended to the +reject_data+ file.
field_sep::
+
The field separator used in the data file. The same separator will
be used by the generated +COPY+ commands, thus +pgloader+ does not
have to deal with escaping the delimiter it uses (input data has to
have escaped it).
+
This parameter is optional and defaults to the pipe char '|'.
client_encoding::
+
Set this parameter to have pgloader connect to PostgreSQL using this
encoding.
+
This parameter is optional and defaults to 'latin9'. If defined at the
table level, this local value will override the global one.
datestyle::
+
Set this parameter to have pgloader connect to PostgreSQL using this
+datestyle+ setting.
+
This parameter is optional and has no default. If defined at the table
level, this local value will override the global one.
null::
+
You can configure here how the null value is represented in your flat
data file.
+
This parameter is optional and defaults to +''+ (that is, the empty
string). If defined at the table level, this local value will override
the global one.
empty_string::
+
You can configure here how empty values are represented in your flat
data file.
+
This parameter is optional and defaults to '\ ' (that is, a backslash
followed by a space). If defined at the table level, this local value
will override the global one.
index::
+
Table index definition, to be used in blob +UPDATE+'ing. You define an
index column by giving its name and its column number (as found in
your data file, and counting from 1) separated by a colon. If your
table has a composite key, then you can define multiple columns here,
separated by a comma.
+
index = colname:3, other_colname:5
columns::
+
You can define here table columns, with the same definition format as
in previous index parameter.
+
Note you'll have to define here all the columns to be found in the data
file, whether you want to use them all or not. When not using them
all, use the +only_cols+ parameter to restrict.
+
As of +pgloader 2.2+ the column list used might not be the same as the
table columns definition.
+
As of +pgloader 2.2.1+ you can omit column numbering if you want to: a
counter is then maintained for you, starting from 1 and set to +last
value + 1+ on each column, where +last value+ was either computed or
given in the config. So you can even omit numbering for only 'some' of
the columns (see the sketch following this parameter description).
+
columns = x, y, a, b, d:6, c:5
+
In case you have a lot of columns per table, you will want to use
multiple lines for this parameter value. Python's ConfigParser module
knows how to read multi-line parameters; you don't have to escape
anything.
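The automatic numbering rule described above can be sketched as follows; this is a simplified, illustrative rendering of the column-list parsing added in this commit's +pgloader/pgloader.py+, not the verbatim code.
----
# Minimal sketch of the automatic column numbering (illustrative only).
def parse_columns(defstring):
    """Parse 'x, y, a, b, d:6, c:5' into a list of (name, position) pairs."""
    columns = []
    serial = 1
    for field_def in defstring.split(','):
        if ':' not in field_def:
            # no explicit number given: use the running counter
            name, pos = field_def.strip(), serial
        else:
            name, pos = [x.strip() for x in field_def.split(':')]
            pos = int(pos)
        columns.append((name, pos))
        # next implicit position is last value + 1
        serial = pos + 1
    return columns

# parse_columns('x, y, a, b, d:6, c:5')
#   -> [('x', 1), ('y', 2), ('a', 3), ('b', 4), ('d', 6), ('c', 5)]
----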
only_cols::
+
If you want to load only a part of the columns you have in the data
file, this option lets you define which columns you're interested
in. +only_cols+ is a comma-separated list of ranges or values, as in
the following example.
+
only_cols = 1-3, 5
+
This parameter is optional and defaults to the list of all columns
given in the columns parameter list, in colname order. A sketch of how
such a range list expands follows this description.
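As an illustration of the range syntax, a value such as '1-3, 5' expands to the column positions 1, 2, 3 and 5. The following sketch is illustrative only and is not pgloader's actual code.
----
# Illustrative sketch: expand an only_cols value like '1-3, 5' into positions.
def expand_only_cols(value):
    positions = []
    for item in value.split(','):
        item = item.strip()
        if '-' in item:
            low, high = [int(x) for x in item.split('-')]
            positions.extend(range(low, high + 1))
        else:
            positions.append(int(item))
    return positions

# expand_only_cols('1-3, 5') -> [1, 2, 3, 5]
----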
blob_columns::
+
The definition of the columns where to find a blob or clob
reference. This definition is composed of a table column name, a
column number (counting from one) referencing the Informix +UNLOAD+
data file, and a large object type, separated by colons. You can have
several columns in this field, separated by a comma.
+
Supported large object types are Informix blob and clob; the expected
configuration strings are respectively +ifx_blob+ for binary (bytea)
content and +ifx_clob+ for text values.
+
Here's an example:
+
blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
== TEXT FORMAT CONFIGURATION PARAMETERS ==
field_count::
+
The +UNLOAD+ command does not escape newlines when they appear in
table data. Hence, you may obtain multi-line data files, where a
single database row (say tuple if you prefer to) can span multiple
physical lines into the unloaded file.
+
If this is your case, you may want to configure here the number of
columns per tuple. Then pgloader will count columns and buffer line
input in order to re-assemble several physical lines into one data row
when needed.
+
This parameter is optional.
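The buffering behaviour can be sketched like this; it illustrates the rule described above (a logical row is complete once +field_count+ fields have been seen). It is not pgloader's actual reader, and for simplicity it ignores +trailing_sep+ and escaped separators inside fields.
----
# Illustrative sketch only: reassemble physical lines into logical rows.
def reassemble(lines, field_count, field_sep='|'):
    buffer = ''
    for line in lines:
        buffer = buffer + '\n' + line if buffer else line
        # field_count fields means field_count - 1 separators per row
        if buffer.count(field_sep) >= field_count - 1:
            yield buffer
            buffer = ''
----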
trailing_sep::
+
If this option is set to True, the input data file is known to append
a +field_sep+ as the last character of each of its lines. With this
option set, this last character is then not considered as a field
separator.
+
This parameter is optional and defaults to +False+.
newline_escapes::
+
Sometimes the input data file has field values containing newlines,
and the export program used (such as the Informix +UNLOAD+ command)
escapes those in-field newlines. In that case you want +pgloader+ to
recognize the escape sequences while preserving the embedded newlines.
+
This option does the described work on specified fields and
considering the escaping character you configure, following this
syntax:
+
newline_escapes = colname:\, other_colname:§
+
This parameter is optional, and the extra work is only done when
set. You can configure +newline_escapes+ for as many fields as
necessary, and you may configure a different escaping character each
time.
+
Please note that at the moment, +pgloader+ only supports
single-character +newline_escapes+.
+
When both a global (see +[pgsql]+ section) +newline_escapes+ parameter
and a table-local one are set, +pgloader+ issues a warning and only
considers the global setting.
== CSV FORMAT CONFIGURATION PARAMETERS ==
doublequote::
Controls how instances of +quotechar+ appearing inside a field
should themselves be quoted. When +True+, the character is
doubled. When +False+, the +escapechar+ is used as a prefix to the
+quotechar+. It defaults to +True+.
escapechar::
A one-character string used by the writer to escape the delimiter
if quoting is set to +QUOTE_NONE+ and the +quotechar+ if
+doublequote+ is +False+. On reading, the +escapechar+ removes any
special meaning from the following character. It defaults to
+None+, which disables escaping.
quotechar::
A one-character string used to quote fields containing special
characters, such as the +delimiter+ or +quotechar+, or which
contain new-line characters. It defaults to '"'.
skipinitialspace::
When +True+, whitespace immediately following the +delimiter+ is
ignored. The default is +False+.
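These four options mirror the dialect attributes of Python's +csv+ module, which their names and descriptions suggest is what reads csv-format sections. Under that assumption, a configured file could be read as sketched below; the function and its parameter defaults are illustrative, not pgloader's API.
----
import csv

# Illustrative sketch only: read a csv-format data file with the options above.
def read_csv_rows(filename, field_sep=',', quotechar='"', doublequote=True,
                  escapechar=None, skipinitialspace=False):
    f = open(filename)
    reader = csv.reader(f, delimiter=field_sep, quotechar=quotechar,
                        doublequote=doublequote, escapechar=escapechar,
                        skipinitialspace=skipinitialspace)
    rows = [row for row in reader]
    f.close()
    return rows
----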
== CONFIGURATION EXAMPLE ==
Please see the given configuration example which should be distributed in
+/usr/share/doc/pgloader/examples/pgloader.conf+.
The example configuration file comes with example data and can be used
as a unit test of +pgloader+.
== HISTORY ==
+pgloader+ was first a +tcl+ tool written by Christopher
Kings-Lynne and Jan Wieck, and then maintained by Jean-Paul
Argudo. When it became clear it would be easier to rewrite it in
another language than to properly learn +tcl+ and add to the project
missing options, +pgloader+ was rewritten in python by Dimitri
Fontaine.
+pgloader+ was rewritten to act as an Informix to PostgreSQL migration
helper which imported Informix large objects directly into a
PostgreSQL database.
Then, as we got some data we couldn't find tools to care about, we
decided ifx_blob would become +pgloader+, as it had to be able to
import all Informix +UNLOAD+ data. Those data contain escaped
separators in unquoted data fields and multi-line fields (+\r+ and
+\n+ are not escaped).
== BUGS ==
Please report bugs to Dimitri Fontaine <dim@tapoueh.org>.
When the last line is alone in a +COPY+ command and its parsing ends in
error (not enough columns read, for example), no information is given
back by +pgloader+.
== AUTHORS ==
+pgloader+ is written by Dimitri Fontaine <dim@tapoueh.org>.

View File

@ -1,5 +1,4 @@
#! /usr/bin/env python
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
"""
@ -165,6 +164,9 @@ def parse_config(conffile):
if config.has_option(section, 'client_encoding'):
dbconn.client_encoding = config.get(section, 'client_encoding')
if config.has_option(section, 'datestyle'):
dbconn.datestyle = config.get(section, 'datestyle')
if config.has_option(section, 'copy_every'):
dbconn.copy_every = config.getint(section, 'copy_every')
@ -174,6 +176,12 @@ def parse_config(conffile):
if config.has_option(section, 'copy_delimiter'):
dbconn.copy_sep = config.get(section, 'copy_delimiter')
# optionnal global newline_escapes
if config.has_option(section, 'newline_escapes'):
setting = pgloader.tools.parse_config_string(
config.get(section, 'newline_escapes'))
pgloader.options.NEWLINE_ESCAPES = setting
# Then there are null and empty_string optionnal parameters
# They canbe overriden in specific table configuration
if config.has_option(section, 'null'):
@ -184,12 +192,6 @@ def parse_config(conffile):
pgloader.options.EMPTY_STRING = pgloader.tools.parse_config_string(
config.get(section, 'empty_string'))
# optionnal global newline_escapes
if config.has_option(section, 'newline_escapes'):
setting = pgloader.tools.parse_config_string(
config.get(section, 'newline_escapes'))
pgloader.options.NEWLINE_ESCAPES = setting
except Exception, error:
print "Error: Could not initialize PostgreSQL connection:"
print error

View File

@ -1,4 +1,3 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
#
# pgloader text format reader

View File

@ -1,4 +1,3 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
#
# pgloader database connection handling
@ -9,7 +8,7 @@ from cStringIO import StringIO
from options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC
from options import TRUNCATE, VACUUM
from options import INPUT_ENCODING, PG_CLIENT_ENCODING
from options import INPUT_ENCODING, PG_CLIENT_ENCODING, DATESTYLE
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
from tools import PGLoader_Error
@ -40,6 +39,7 @@ class db:
self.copy_every = copy_every
self.commit_every = commit_every
self.client_encoding = client_encoding
self.datestyle = DATESTYLE
self.null = NULL
self.empty_string = EMPTY_STRING
@ -72,6 +72,22 @@ class db:
cursor.execute(sql, [self.client_encoding])
cursor.close()
def set_datestyle(self):
""" set session datestyle to self.datestyle """
if self.datestyle is None:
return
if DEBUG:
# debug only cause reconnecting happens on every
# configured section
print 'Setting datestyle to %s' % self.datestyle
sql = 'set session datestyle to %s'
cursor = self.dbconn.cursor()
cursor.execute(sql, [self.datestyle])
cursor.close()
def reset(self):
""" reset internal counters and open a new database connection """
self.buffer = None
@ -94,6 +110,7 @@ class db:
self.dbconn = psycopg.connect(self.dsn)
self.set_encoding()
self.set_datestyle()
def print_stats(self):
""" output some stats about recent activity """
@ -339,8 +356,8 @@ class db:
except psycopg.DatabaseError, error:
# non recoverable error
mesg = "\n".join(["Please check PostgreSQL logs",
"HINT: double check your client_encoding" +
" and copy_delimiter settings"])
"HINT: double check your client_encoding,"+
" datestyle and copy_delimiter settings"])
raise PGLoader_Error, mesg
# prepare next run

View File

@ -1,4 +1,3 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
#
# pgloader Large Object support

View File

@ -1,10 +1,10 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
#
# Some common options, for each module to get them
INPUT_ENCODING = None
PG_CLIENT_ENCODING = 'latin9'
DATESTYLE = None
COPY_SEP = None
FIELD_SEP = '|'

View File

@ -1,4 +1,3 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
#
# pgloader main class
@ -77,6 +76,14 @@ class PGLoader:
print "client_encoding: '%s'" % self.db.client_encoding
# optionnal local option datestyle
if config.has_option(name, 'datestyle'):
self.db.datestyle = config.get(name, 'datestyle')
if DEBUG:
print "datestyle: '%s'" % self.db.datestyle
##
# data filename
for opt in ('table', 'filename'):
@ -252,8 +259,14 @@ class PGLoader:
f = self.__dict__[attr] = []
try:
serial = 1
for field_def in str.split(','):
properties = [x.strip() for x in field_def.split(':')]
if argtype == 'int' and field_def.find(':') == -1:
# support for automatic ordering
properties = [field_def.strip(), serial]
else:
properties = [x.strip() for x in field_def.split(':')]
if not btype:
# normal column definition, for COPY usage
@ -265,6 +278,10 @@ class PGLoader:
# UPDATE usage
colname, arg, btype = properties
f.append((colname, __getarg(arg, argtype), btype))
# update serial
if argtype == 'int':
serial = int(arg) + 1
except Exception, error:
# FIXME: make some errors and write some error messages

View File

@ -1,4 +1,3 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dim@tapoueh.org>
#
# pgloader data reader interface and defaults

View File

@ -1,4 +1,3 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
#
# pgloader text format reader

View File

@ -1,4 +1,3 @@
# -*- coding: ISO-8859-15 -*-
# Author: Dimitri Fontaine <dimitri@dalibo.com>
#
# pgloader librairies