mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-04 10:31:02 +02:00
* Support for datestyle setting
* Support for omitting column numbering * Change documentation source format from SGML to asciidoc * New version 2.2.1
This commit is contained in:
parent
8ed8219e37
commit
921db51d65
14
Makefile
14
Makefile
@ -1,5 +1,4 @@
|
||||
DOCS = pgloader.1.sgml
|
||||
GARBAGE = manpage.links manpage.refs
|
||||
DOCS = pgloader.1.txt
|
||||
|
||||
# debian setting
|
||||
DESTDIR =
|
||||
@ -18,6 +17,11 @@ install:
|
||||
cp -a $(libs) $(libdir)/pgloader
|
||||
cp -a $(examples) $(exdir)
|
||||
|
||||
man: $(DOCS)
|
||||
docbook2man $(DOCS) 2>/dev/null
|
||||
-rm -f $(GARBAGE)
|
||||
html: $(DOCS)
|
||||
asciidoc -a toc $<
|
||||
|
||||
pgloader.1.xml: $(DOCS)
|
||||
asciidoc -d manpage -b docbook $<
|
||||
|
||||
man: pgloader.1.xml
|
||||
xmlto man $<
|
||||
|
||||
8
debian/changelog
vendored
8
debian/changelog
vendored
@ -1,3 +1,11 @@
|
||||
pgloader (2.2.1) unstable; urgency=low
|
||||
|
||||
* Support for datestyle setting
|
||||
* Support for omitting column numbering
|
||||
* Change documentation source format from SGML to asciidoc
|
||||
|
||||
-- Dimitri Fontaine <dim@tapoueh.org> Thu, 23 Aug 2007 12:35:34 +0200
|
||||
|
||||
pgloader (2.2.0) unstable; urgency=low
|
||||
|
||||
* Support for partial loading of data (subrange(s) of columns)
|
||||
|
||||
2
debian/files
vendored
2
debian/files
vendored
@ -1 +1 @@
|
||||
pgloader_2.0.2_all.deb misc extra
|
||||
pgloader_2.2.0_all.deb misc extra
|
||||
|
||||
@ -18,6 +18,7 @@ newline_escapes = \
|
||||
[simple]
|
||||
table = simple
|
||||
format = text
|
||||
datestyle = dmy
|
||||
filename = simple/simple.data
|
||||
field_sep = |
|
||||
trailing_sep = True
|
||||
@ -75,6 +76,6 @@ format = csv
|
||||
filename = csv/csv.data
|
||||
field_sep = ,
|
||||
quotechar = "
|
||||
columns = x:1, y:2, a:3, b:4, c:5, d:6
|
||||
columns = x, y, a, b, d:6, c:5
|
||||
only_cols = 3-6
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
1|some first row text|2006-11-11|
|
||||
2|some second row text|2006-11-11|
|
||||
3|some third row text|2006-10-12|
|
||||
2|some second row text|13/11/2006|
|
||||
3|some third row text|12-10-2006|
|
||||
4|\ |2006-10-4|
|
||||
5|some fifth row text|2006-5-12|
|
||||
6|some sixth row text|2006-7-10|
|
||||
6|some sixth row text|10/7/6|
|
||||
7|some null date to play with||
|
||||
853
pgloader.1.sgml
853
pgloader.1.sgml
@ -1,853 +0,0 @@
|
||||
<!doctype refentry PUBLIC "-//OASIS//DTD DocBook V4.1//EN">
|
||||
<refentry>
|
||||
<refentryinfo>
|
||||
<address>
|
||||
<email>dim@tapoueh.org</email>
|
||||
</address>
|
||||
<author>
|
||||
<firstname>Dimitri</firstname>
|
||||
<surname>Fontaine</surname>
|
||||
</author>
|
||||
<date>August 2006</date>
|
||||
<copyright>
|
||||
<year>2006</year>
|
||||
<holder>Dimitri Fontaine</holder>
|
||||
</copyright>
|
||||
</refentryinfo>
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>pgloader</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>pgloader</refname>
|
||||
<refpurpose>
|
||||
Import CSV data and Large Object to PostgreSQL
|
||||
</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsynopsisdiv>
|
||||
<cmdsynopsis>
|
||||
<command>pgloader</command>
|
||||
<arg><option>-c</option> configuration file</arg>
|
||||
<arg><option>-p</option> pedantic</arg>
|
||||
<arg><option>-d</option> debug</arg>
|
||||
<arg><option>-v</option> verbose</arg>
|
||||
<arg><option>-n</option> dry run</arg>
|
||||
<arg><option>-Cn</option> count</arg>
|
||||
<arg><option>-Fn</option> from</arg>
|
||||
<arg><option>-In</option> from id</arg>
|
||||
<arg><option>-E</option> input files encoding</arg>
|
||||
<arg>Section1 Section2</arg>
|
||||
</cmdsynopsis>
|
||||
</refsynopsisdiv>
|
||||
|
||||
<refsect1>
|
||||
<title>DESCRIPTION</title>
|
||||
<para>
|
||||
<command>pgloader</command> imports data from a flat file and
|
||||
insert it into a database table. It uses a flat file per
|
||||
database table, and you can configure as many Sections as you
|
||||
want, each one associating a table name and a data file.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Data are parsed and rewritten, then given to PostgreSQL
|
||||
<command>COPY</command> command. Parsing is necessary for
|
||||
dealing with end of lines and eventual trailing separator
|
||||
characters, and for column reordering: your flat data file may
|
||||
not have the same column order as the database table has.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<command>pgloader</command> is also able to load some large
|
||||
objects data into PostgreSQL, as of now only Informix
|
||||
<command>UNLOAD</command> data files are supported. This command
|
||||
gives large objects data location information into the main data
|
||||
file. <command>pgloader</command> parses it and produces an SQL
|
||||
UPDATE order per large object, and commit those orders once
|
||||
every <command>commit_every</command> configuration parameter.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<command>pgloader</command> issues some timing statistics
|
||||
every <command>commit_every</command> commits (see Configuration
|
||||
for this setting). At the end of each section processing, a
|
||||
summary of overall operations, numbers of updates and commits,
|
||||
time it took in seconds, errors logged and database errors is
|
||||
issued.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>OPTIONS</title>
|
||||
|
||||
<para>
|
||||
In order for <command>pgloader</command> to run, you have to
|
||||
edit a configuration file (see Configuration) consisting of
|
||||
Section definitions. Each section refers to a PostgreSQL table
|
||||
into which some data is to be loaded.
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>-c</option></term>
|
||||
<term><option>--config</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
specifies the configuration file to use. The default file
|
||||
name is <filename>pgloader.conf</filename>, searched into
|
||||
current working directory.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-p</option></term>
|
||||
<term><option>--pedantic</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
activates the <command>pedantic</command> mode, where any
|
||||
warning is considered as a fatal error, thus stopping the
|
||||
processing of the input file.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-d</option></term>
|
||||
<term><option>--debug</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
makes <command>pgloader</command> say it all about what it
|
||||
does. debug implies verbose.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-v</option></term>
|
||||
<term><option>--verbose</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
makes <command>pgloader</command> very verbose about
|
||||
what it does.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-n</option></term>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
makes <command>pgloader</command> simulate operations,
|
||||
that implies no database connection and no data extraction
|
||||
from blob files.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-T</option></term>
|
||||
<term><option>--truncate</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
makes <command>pgloader</command> issue a truncate SQL
|
||||
command before importing data.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-V</option></term>
|
||||
<term><option>--vacuum</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
makes <command>pgloader</command> issue a vacuum full
|
||||
verbose analyse SQL command before importing data.
|
||||
</para>
|
||||
<para>
|
||||
This vacuum is run from shell command
|
||||
<command>/usr/bin/vacuumdb</command> with connection
|
||||
informations taken from configuration file (see
|
||||
Configuration section of this manual page), but without
|
||||
password prompting. If you use this option, please
|
||||
configure your <filename>pg_hba.conf</filename> in a way
|
||||
no password is prompted (trust).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-C</option></term>
|
||||
<term><option>--count</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Number of input lines to process, default is to process
|
||||
all the input lines.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-F</option></term>
|
||||
<term><option>--from</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Input line number from which we begin to process (and
|
||||
count). <command>pgloader</command> will skip all
|
||||
preceding lines.
|
||||
</para>
|
||||
<para>
|
||||
You can't use both <option>-F</option> and
|
||||
<option>-I</option> at the same time.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-I</option></term>
|
||||
<term><option>--from-id</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
From which <command>id</command> do we begin to process
|
||||
(and count) input lines.
|
||||
</para>
|
||||
<para>
|
||||
When a composite key is used, you have to give each column
|
||||
of the key separated by comma, on the form col_name=value.
|
||||
</para>
|
||||
<para>
|
||||
Please notice using the <command>--from-id</command>
|
||||
option implies <command>pgloader</command> will try to get
|
||||
row id of each row, it being on the interval processed or
|
||||
not. This could have some performance impact, and you may
|
||||
end up preferring to use <command>--from</command> instead.
|
||||
</para>
|
||||
<para>
|
||||
Example: <command>pgloader -I col1:val1,col2:val2</command>
|
||||
</para>
|
||||
<para>
|
||||
You can't use both <option>-F</option> and
|
||||
<option>-I</option> at the same time.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-E</option></term>
|
||||
<term><option>--encoding</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Input data files encoding. Defaults to 'latin9'.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>Section</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
is the name of a configured Section describing some data
|
||||
to load
|
||||
</para>
|
||||
<para>
|
||||
Section arguments are optional; if no section is given
|
||||
all configured sections are processed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>GLOBAL CONFIGURATION SECTION</title>
|
||||
<para>
|
||||
The configuration file has a .ini file syntax, its first section
|
||||
has to be the <command>pgsql</command> one, defining how to
|
||||
access to the PostgreSQL database server where to load
|
||||
data. Then you may define any number of sections, each one
|
||||
describing a data loading task to be performed by
|
||||
<command>pgloader</command>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The <command>[pgsql]</command> section has the following
|
||||
options, which all must be set.
|
||||
</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>host</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
PostgreSQL database server name, for example
|
||||
<filename>localhost</filename>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>port</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
PostgreSQL database server listening port, 5432. You have
|
||||
to fill this entry.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>base</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The name of the database you want to load data into.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>user</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Connecting PostgreSQL user name.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>pass</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The password of the user. The better is to grant a
|
||||
<command>trust</command> access privilege in PostgreSQL
|
||||
<filename>pg_hba.conf</filename>. Then you can set this
|
||||
entry to whatever value you want to.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>client_encoding</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Set this parameter to have <command>pgloader</command>
|
||||
connects to PostgreSQL using this encoding.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to 'latin9'.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>copy_every</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
When issuing <command>COPY</command> PostgreSQL commands,
|
||||
<command>pgloader</command> will not make a single big
|
||||
COPY attempt, but copy <command>copy_every</command> lines
|
||||
at a time.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to 10000.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>commit_every</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
PostgreSQL <command>COMMIT</command> frequency, expressed
|
||||
in <command>UPDATE</command> orders. A good value is 1000,
|
||||
that means committing the SQL transaction every 1000 input
|
||||
lines.
|
||||
</para>
|
||||
<para>
|
||||
<command>pgloader</command> issues commit every
|
||||
commit_every updates, on connection closing and when a SQL
|
||||
error occurs.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to 1000.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>copy_delimiter</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The field separator to use in COPY FROM produced statements. If
|
||||
you don't specify this, the same separator as the one given in
|
||||
<command>field_sep</command> parameter will be used.
|
||||
</para>
|
||||
<para>
|
||||
Please note <command>PostgreSQL</command> requires a single char
|
||||
properly encoded (see your <command>client_encoding</command>
|
||||
parameter), or it aborts with an error and may even crash.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to
|
||||
<command>field_sep</command>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>newline_escapes</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
For parameter effect description, see below (same name, table
|
||||
local setting).
|
||||
</para>
|
||||
<para>
|
||||
You can set up here a global escape character, to be
|
||||
considered on each and every column of each and every
|
||||
text-format table defined thereafter.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>null</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
You can configure here how <command>null</command> value is
|
||||
represented into your flat data file.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to
|
||||
<command>''</command> (that is empty string).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>empty_string</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
You can configure here how empty values are represented into
|
||||
your flat data file.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to <command>'\
|
||||
'</command> (that is backslash followed by space).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>COMMON FORMAT CONFIGURATION PARAMETERS</title>
|
||||
<para>
|
||||
You then can define any number of data section, and give them an
|
||||
arbitrary name. Some options are required, some are actually
|
||||
optional, in which case it is said so thereafter.
|
||||
</para>
|
||||
<para>
|
||||
First, we'll go through common parameters, applicable whichever
|
||||
format of data you're referring to. Then text-format only
|
||||
parameters will be presented, followed by csv-only parameters.
|
||||
</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>table</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The table name of the database where to load data.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>format</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The format data are to be found, either
|
||||
<command>text</command> or <command>csv</command>.
|
||||
</para>
|
||||
<para>
|
||||
See next sections for format specific options.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>filename</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The absolute path to the input data file. The large object
|
||||
files are to be found into the same directory. Their name
|
||||
can be in the form [bc]lob[0-9a-f]{4}.[0-9a-f]{3}, but
|
||||
this information is not used by
|
||||
<command>pgloader</command>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>reject_log</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
In case of errors processing input data, a human readable
|
||||
log per rejected input data line is produced into the
|
||||
reject_log file.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>reject_data</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
In case of errors processing input data, the rejected
|
||||
input line is appended to the reject_data file.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>field_sep</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The field separator used into the data file. The same
|
||||
separator will be used by the generated
|
||||
<command>COPY</command> commands, thus
|
||||
<command>pgloader</command> does not have to deal with
|
||||
escaping the delimiter it uses (input data has to have
|
||||
escaped it).
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to pipe char '|'.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>client_encoding</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Set this parameter to have <command>pgloader</command>
|
||||
connects to PostgreSQL using this encoding.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to 'latin9'. If defined
|
||||
on a table level, this local value will overwrite the global
|
||||
one.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>null</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
You can configure here how <command>null</command> value is
|
||||
represented into your flat data file.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to
|
||||
<command>''</command> (that is empty string). If defined on a
|
||||
table level, this local value will overwrite the global one.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>empty_string</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
You can configure here how empty values are represented into
|
||||
your flat data file.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to <command>'\
|
||||
'</command> (that is backslash followed by space). If defined on
|
||||
a table level, this local value will overwrite the global one.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>index</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Table index definition, to be used in blob UPDATE'ing. You
|
||||
define an index column by giving its name and its column
|
||||
number (as found into your data file, and counting from 1)
|
||||
separated by a colon. If your table has a composite key,
|
||||
then you can define multiple columns here, separated by a
|
||||
comma.
|
||||
</para>
|
||||
<para>
|
||||
index = colname:3, other_colname:5
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>columns</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
You can define here table columns, with the same
|
||||
definition format as in previous <command>index</command>
|
||||
parameter.
|
||||
</para>
|
||||
<para>
|
||||
Note you'll have to define here all the columns to be
|
||||
found in data file, whether you want to use them all or
|
||||
not. When not using them all, use the
|
||||
<command>only_cols</command> parameter to restrict.
|
||||
</para>
|
||||
<para>
|
||||
As of <command>pgloader 2.2</command> the column list used
|
||||
might not be the same as the table columns definition.
|
||||
</para>
|
||||
<para>
|
||||
In case you have a lot of columns per table, you will want
|
||||
to use multiple lines for this parameter value. Python
|
||||
<command>ConfigParser</command> module knows how to read
|
||||
multi-line parameters, you don't have to escape anything.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>only_cols</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If you want to only load a part of the columns you have
|
||||
into the data file, this option lets you define which
|
||||
columns you're interested in. <command>only_cols</command>
|
||||
is a comma separated list of ranges or values, as in
|
||||
following example.
|
||||
</para>
|
||||
<para>
|
||||
only_cols = 1-3, 5
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to the list of
|
||||
all columns given on the <command>columns</command>
|
||||
parameter list, in the colname order.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>blob_columns</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The definition of the columns where to find some blob or
|
||||
clob reference. This definition is composed by a table
|
||||
column name, a column number (counting from one) reference
|
||||
into the Informix <command>UNLOAD</command> data file, and
|
||||
a large object type, separated by a colon. You can have
|
||||
several columns in this field, separated by a
|
||||
comma.
|
||||
</para>
|
||||
<para>
|
||||
Supported large objects type are Informix blob and clob,
|
||||
the awaited configuration string are respectively
|
||||
<command>ifx_blob</command> for binary (bytea) content
|
||||
type and <command>ifx_clob</command> for text type values.
|
||||
</para>
|
||||
<para>
|
||||
Here's an example:
|
||||
</para>
|
||||
<para>
|
||||
blob_columns = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>TEXT FORMAT CONFIGURATION PARAMETERS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>field_count</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The <command>UNLOAD</command> command does not escape
|
||||
newlines when they appear into table data. Hence, you may
|
||||
obtain multi-line data files, where a single database row
|
||||
(say tuple if you prefer to) can span multiple physical
|
||||
lines into the unloaded file.
|
||||
</para>
|
||||
<para>
|
||||
If this is your case, you may want to configure here the
|
||||
number of columns per tuple. Then
|
||||
<command>pgloader</command> will count columns and
|
||||
buffer line input in order to re-assemble several physical
|
||||
lines into one data row when needed.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>trailing_sep</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If this option is set to <command>True</command>, the
|
||||
input data file is known to append a
|
||||
<command>field_sep</command> as the last character of each
|
||||
of its lines. With this option set, this last character is
|
||||
then not considered as a field separator.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional and defaults to False.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>newline_escapes</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Sometimes the input data file has field values containing
|
||||
newlines, and the export program used (as Informix
|
||||
<command>UNLOAD</command> command) escape in-field
|
||||
newlines. So you want <command>pgloader</command> to keep
|
||||
those newlines, while at the same time preserving them.
|
||||
</para>
|
||||
<para>
|
||||
This option does the described work on specified fields
|
||||
and considering the escaping character you configure,
|
||||
following this syntax:
|
||||
</para>
|
||||
<para>
|
||||
newline_escapes = colname:\, other_colname:§
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional, and the extra work is only
|
||||
done when set. You can configure
|
||||
<command>newline_escapes</command> for as many fields as
|
||||
necessary, and you may configure a different escaping
|
||||
character each time.
|
||||
</para>
|
||||
<para>
|
||||
Please note that at the moment,
|
||||
<command>pgloader</command> does only support one
|
||||
character length <command>newline_escapes</command>.
|
||||
</para>
|
||||
<para>
|
||||
When both a global (see <command>[pgsql]</command> section)
|
||||
<command>newline_escapes</command> parameter and a table local
|
||||
one are set, <command>pgloader</command> issues a warning and
|
||||
only considers the global setting.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>CSV FORMAT CONFIGURATION PARAMETERS</title>
|
||||
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>doublequote</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Controls how instances of quotechar appearing inside a
|
||||
field should themselves be quoted. When True, the
|
||||
character is doubled. When False, the escapechar is used
|
||||
as a prefix to the quotechar. It defaults to True.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>escapechar</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
A one-character string used by the writer to escape the
|
||||
delimiter if quoting is set to QUOTE_NONE and the
|
||||
quotechar if doublequote is False. On reading, the
|
||||
escapechar removes any special meaning from the following
|
||||
character. It defaults to None, which disables escaping.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>quotechar</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
A one-character string used to quote fields containing
|
||||
special characters, such as the delimiter or quotechar, or
|
||||
which contain new-line characters. It defaults to '"'.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>skipinitialspace</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
When True, whitespace immediately following the delimiter
|
||||
is ignored. The default is False.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>CONFIGURATION EXAMPLE</title>
|
||||
<para>
|
||||
Please see the given configuration example which should be distributed
|
||||
in
|
||||
<filename>/usr/share/doc/pgloader/examples/pgloader.conf</filename>.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>HISTORY</title>
|
||||
<para>
|
||||
<command>pgloader</command> was at first an Informix to
|
||||
PostgreSQL migration helper which imported Informix large
|
||||
objects directly into a PostgreSQL database.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Then as we got some data we couldn't find tools to take care of,
|
||||
we decided <command>ifx_blob</command> would become
|
||||
<command>pgloader</command>, as it had to be able to import all
|
||||
Informix UNLOAD data. Those data contains escaped separator into
|
||||
unquoted data field and multi-lines fields (\r and \n are not
|
||||
escaped).
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>BUGS</title>
|
||||
<para>
|
||||
Please report bugs to Dimitri Fontaine <dim@tapoueh.org>.
|
||||
</para>
|
||||
<para>
|
||||
When last line is alone on a <command>COPY</command> command and its
|
||||
parsing ends in error (not enough columns read for example), no
|
||||
information is given back by <command>pgloader</command>.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>AUTHORS</title>
|
||||
<para>
|
||||
<command>pgloader</command> is written by <author>Dimitri
|
||||
Fontaine</author> <email>dim@tapoueh.org</email>.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
486
pgloader.1.txt
Normal file
486
pgloader.1.txt
Normal file
@ -0,0 +1,486 @@
|
||||
= pgloader(1) =
|
||||
|
||||
== NAME ==
|
||||
|
||||
pgloader - Import CSV data and Large Object to PostgreSQL
|
||||
|
||||
== SYNOPSIS ==
|
||||
|
||||
pgloader [-c configuration file] [-p pedantic] [-d debug] [-v verbose]
|
||||
[-n dryrun] [-Cn count] [-Fn from] [-In from id] [-E input files encoding]
|
||||
[Section1 Section2]
|
||||
|
||||
== DESCRIPTION ==
|
||||
|
||||
+pgloader+ imports data from a flat file and inserts it into a database
|
||||
table. It uses a flat file per database table, and you can configure
|
||||
as many Sections as you want, each one associating a table name and a
|
||||
data file.
|
||||
|
||||
Data are parsed and rewritten, then given to PostgreSQL +COPY+
|
||||
command. Parsing is necessary for dealing with end of lines and
|
||||
eventual trailing separator characters, and for column reordering:
|
||||
your flat data file may not have the same column order as the database
|
||||
table has.
|
||||
|
||||
+pgloader+ is also able to load some large objects data into
|
||||
PostgreSQL, as of now only Informix +UNLOAD+ data files are
|
||||
supported. This command gives large objects data location information
|
||||
into the main data file. +pgloader+ parses it and adds the +text+ or +bytea+
|
||||
content properly escaped to the +COPY+ data.
|
||||
|
||||
+pgloader+ issues some timing statistics every +commit_every+ commits
|
||||
(see Configuration for this setting). At the end of each section
|
||||
processing, a summary of overall operations, numbers of updates and
|
||||
commits, time it took in seconds, errors logged and database errors is
|
||||
issued.
|
||||
|
||||
+pgloader+ is available from +pgfoundry+ at
|
||||
http://pgfoundry.org/projects/pgloader/[], where you'll find a debian
|
||||
package, a source package and an anonymous CVS.
|
||||
|
||||
== OPTIONS ==
|
||||
|
||||
In order for pgloader to run, you have to edit a configuration file
|
||||
(see Configuration) consisting of Section definitions. Each section
|
||||
refers to a PostgreSQL table into which some data is to be loaded.
|
||||
|
||||
-c, --config::
|
||||
|
||||
specifies the configuration file to use. The default file name is
|
||||
pgloader.conf, searched for in the current working directory.
|
||||
|
||||
-p, --pedantic::
|
||||
|
||||
activates the pedantic mode, where any warning is considered as a fatal
|
||||
error, thus stopping the processing of the input file.
|
||||
|
||||
-d, --debug::
|
||||
|
||||
makes pgloader say it all about what it does. debug implies verbose.
|
||||
|
||||
-v, --verbose::
|
||||
|
||||
makes pgloader very verbose about what it does.
|
||||
|
||||
-n, --dry-run::
|
||||
|
||||
makes pgloader simulate operations, that implies no database connection and
|
||||
no data extraction from blob files.
|
||||
|
||||
-T, --truncate::
|
||||
|
||||
makes pgloader issue a truncate SQL command before importing data.
|
||||
|
||||
-V, --vacuum::
|
||||
+
|
||||
makes pgloader issue a vacuum full verbose analyse SQL command before
|
||||
importing data.
|
||||
+
|
||||
This vacuum is run from shell command /usr/bin/vacuumdb with
|
||||
connection information taken from configuration file (see
|
||||
Configuration section of this manual page), but without password
|
||||
prompting. If you use this option, please configure your pg_hba.conf
|
||||
in a way no password is prompted (trust).
|
||||
|
||||
-C, --count::
|
||||
|
||||
Number of input lines to process, default is to process all the input
|
||||
lines.
|
||||
|
||||
-F, --from::
|
||||
+
|
||||
Input line number from which we begin to process (and count). pgloader
|
||||
will skip all preceding lines.
|
||||
+
|
||||
You can't use both -F and -I at the same time.
|
||||
|
||||
-I, --from-id::
|
||||
+
|
||||
From which id do we begin to process (and count) input lines.
|
||||
+
|
||||
When a composite key is used, you have to give each column of the key
|
||||
separated by comma, on the form col_name=value.
|
||||
+
|
||||
Please notice using the --from-id option implies pgloader will try to
|
||||
get row id of each row, it being on the interval processed or
|
||||
not. This could have some performance impact, and you may end up
|
||||
preferring to use --from instead.
|
||||
+
|
||||
Example: pgloader -I col1:val1,col2:val2
|
||||
+
|
||||
You can't use both -F and -I at the same time.
|
||||
|
||||
-E, --encoding::
|
||||
|
||||
Input data files encoding. Defaults to 'latin9'.
|
||||
|
||||
Section::
|
||||
+
|
||||
is the name of a configured Section describing some data to load
|
||||
+
|
||||
Section arguments are optional; if no section is given, all configured
|
||||
sections are processed.
|
||||
|
||||
== GLOBAL CONFIGURATION SECTION ==
|
||||
|
||||
The configuration file has a .ini file syntax, its first section has
|
||||
to be the pgsql one, defining how to access the PostgreSQL database
|
||||
server where to load data. Then you may define any number of sections,
|
||||
each one describing a data loading task to be performed by pgloader.
|
||||
|
||||
The [pgsql] section has the following options, which all must be set.
|
||||
|
||||
host::
|
||||
|
||||
PostgreSQL database server name, for example localhost.
|
||||
|
||||
port::
|
||||
|
||||
PostgreSQL database server listening port, 5432. You have to fill this
|
||||
entry.
|
||||
|
||||
base::
|
||||
|
||||
The name of the database you want to load data into.
|
||||
|
||||
user::
|
||||
|
||||
Connecting PostgreSQL user name.
|
||||
|
||||
pass::
|
||||
|
||||
The password of the user. It is better to grant a trust access privilege
|
||||
in PostgreSQL pg_hba.conf. Then you can set this entry to whatever value
|
||||
you want to.
|
||||
|
||||
client_encoding::
|
||||
+
|
||||
Set this parameter to have pgloader connect to PostgreSQL using this
|
||||
encoding.
|
||||
+
|
||||
This parameter is optional and defaults to 'latin9'.
|
||||
|
||||
datestyle::
|
||||
+
|
||||
Set this parameter to have pgloader connect to PostgreSQL using this
|
||||
datestyle setting.
|
||||
+
|
||||
This parameter is optional and has no default value, thus pgloader will
|
||||
use whatever your PostgreSQL is configured to as default.
|
||||
|
||||
copy_every::
|
||||
+
|
||||
When issuing +COPY+ PostgreSQL commands, pgloader will not make a
|
||||
single big +COPY+ attempt, but copy copy_every lines at a time.
|
||||
+
|
||||
This parameter is optional and defaults to 10000.
|
||||
|
||||
commit_every::
|
||||
+
|
||||
PostgreSQL +COMMIT+ frequency, expressed in +UPDATE+ orders. A good
|
||||
value is 1000, that means committing the SQL transaction every 1000
|
||||
input lines.
|
||||
+
|
||||
+pgloader+ issues commit every +commit_every+ updates, on connection
|
||||
closing and when a SQL error occurs.
|
||||
+
|
||||
This parameter is optional and defaults to 1000.
|
||||
|
||||
copy_delimiter::
|
||||
+
|
||||
The field separator to use in +COPY FROM+ produced statements. If you
|
||||
don't specify this, the same separator as the one given in +field_sep+
|
||||
parameter will be used.
|
||||
+
|
||||
Please note PostgreSQL requires a single char properly encoded (see
|
||||
your +client_encoding+ parameter), or it aborts with an error and may even
|
||||
crash.
|
||||
+
|
||||
This parameter is optional and defaults to +field_sep+.
|
||||
|
||||
newline_escapes::
|
||||
+
|
||||
For parameter effect description, see below (same name, table local
|
||||
setting).
|
||||
+
|
||||
You can set up here a global escape character, to be considered on each
|
||||
and every column of each and every text-format table defined
|
||||
thereafter.
|
||||
|
||||
null::
|
||||
+
|
||||
You can configure here how null value is represented into your flat
|
||||
data file.
|
||||
+
|
||||
This parameter is optional and defaults to '' (that is +empty string+).
|
||||
|
||||
empty_string::
|
||||
+
|
||||
You can configure here how empty values are represented into your flat
|
||||
data file.
|
||||
+
|
||||
This parameter is optional and defaults to '\ ' (that is backslash
|
||||
followed by space).
|
||||
|
||||
|
||||
== COMMON FORMAT CONFIGURATION PARAMETERS ==
|
||||
|
||||
You then can define any number of data sections, and give them an arbitrary
|
||||
name. Some options are required, some are actually optional, in which case it
|
||||
is said so thereafter.
|
||||
|
||||
First, we'll go through common parameters, applicable whichever format of data
|
||||
you're referring to. Then text-format only parameters will be presented,
|
||||
followed by csv-only parameters.
|
||||
|
||||
table::
|
||||
|
||||
The table name of the database where to load data.
|
||||
|
||||
format::
|
||||
+
|
||||
The format in which data are to be found, either text or csv.
|
||||
+
|
||||
See next sections for format specific options.
|
||||
|
||||
filename::
|
||||
|
||||
The absolute path to the input data file. The large object files
|
||||
are to be found into the same directory. Their name can be in the
|
||||
form +[bc]lob[0-9a-f]{4}.[0-9a-f]{3}+, but this information is not
|
||||
used by +pgloader+.
|
||||
|
||||
reject_log::
|
||||
|
||||
In case of errors processing input data, a human readable log per rejected
|
||||
input data line is produced into the +reject_log+ file.
|
||||
|
||||
reject_data::
|
||||
|
||||
In case of errors processing input data, the rejected input line is
|
||||
appended to the +reject_data+ file.
|
||||
|
||||
field_sep::
|
||||
+
|
||||
The field separator used into the data file. The same separator will
|
||||
be used by the generated +COPY+ commands, thus +pgloader+ does not
|
||||
have to deal with escaping the delimiter it uses (input data has to
|
||||
have escaped it).
|
||||
+
|
||||
This parameter is optional and defaults to pipe char '|'.
|
||||
|
||||
client_encoding::
|
||||
+
|
||||
Set this parameter to have pgloader connect to PostgreSQL using this
|
||||
encoding.
|
||||
+
|
||||
This parameter is optional and defaults to 'latin9'. If defined on a
|
||||
table level, this local value will override the global one.
|
||||
|
||||
datestyle::
|
||||
+
|
||||
Set this parameter to have pgloader connect to PostgreSQL using this
|
||||
+datestyle+ setting.
|
||||
+
|
||||
This parameter is optional and has no default. If defined on a table
|
||||
level, this local value will override the global one.
|
||||
|
||||
null::
|
||||
+
|
||||
You can configure here how null value is represented into your flat
|
||||
data file.
|
||||
+
|
||||
This parameter is optional and defaults to +''+ (that is empty
|
||||
string). If defined on a table level, this local value will override
|
||||
the global one.
|
||||
|
||||
empty_string::
|
||||
+
|
||||
You can configure here how empty values are represented into your flat
|
||||
data file.
|
||||
+
|
||||
This parameter is optional and defaults to '\ ' (that is backslash
|
||||
followed by space). If defined on a table level, this local value will
|
||||
override the global one.
|
||||
|
||||
index::
|
||||
+
|
||||
Table index definition, to be used in blob +UPDATE+'ing. You define an
|
||||
index column by giving its name and its column number (as found into
|
||||
your data file, and counting from 1) separated by a colon. If your
|
||||
table has a composite key, then you can define multiple columns here,
|
||||
separated by a comma.
|
||||
+
|
||||
index = colname:3, other_colname:5
|
||||
|
||||
columns::
|
||||
+
|
||||
You can define here table columns, with the same definition format as
|
||||
in previous index parameter.
|
||||
+
|
||||
Note you'll have to define here all the columns to be found in data
|
||||
file, whether you want to use them all or not. When not using them
|
||||
all, use the +only_cols+ parameter to restrict.
|
||||
+
|
||||
As of +pgloader 2.2+ the column list used might not be the same as the
|
||||
table columns definition.
|
||||
+
|
||||
As of +pgloader 2.2.1+ you can omit column numbering if you want to, a
|
||||
counter is then maintained for you, starting from 1 and set to +last
|
||||
value + 1+ on each column, where +last value+ was either computed or
|
||||
given in the config. So you can even omit only 'some' columns in
|
||||
there.
|
||||
+
|
||||
columns = x, y, a, b, d:6, c:5
|
||||
+
|
||||
In case you have a lot of columns per table, you will want to use
|
||||
multiple lines for this parameter value. Python ConfigParser module
|
||||
knows how to read multi-line parameters, you don't have to escape
|
||||
anything.
|
||||
|
||||
only_cols::
|
||||
+
|
||||
If you want to only load a part of the columns you have into the data
|
||||
file, this option lets you define which columns you're interested
|
||||
in. +only_cols+ is a comma separated list of ranges or values, as in
|
||||
following example.
|
||||
+
|
||||
only_cols = 1-3, 5
|
||||
+
|
||||
This parameter is optional and defaults to the list of all columns
|
||||
given on the columns parameter list, in the colname order.
|
||||
|
||||
blob_columns::
|
||||
+
|
||||
The definition of the columns where to find some blob or clob
|
||||
reference. This definition is composed by a table column name, a
|
||||
column number (counting from one) reference into the Informix +UNLOAD+
|
||||
data file, and a large object type, separated by a colon. You can have
|
||||
several columns in this field, separated by a comma.
|
||||
+
|
||||
Supported large object types are Informix blob and clob, the awaited
|
||||
configuration strings are respectively +ifx_blob+ for binary (bytea)
|
||||
content type and +ifx_clob+ for text type values.
|
||||
+
|
||||
Here's an example:
|
||||
+
|
||||
blob_columns = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
|
||||
|
||||
== TEXT FORMAT CONFIGURATION PARAMETERS ==
|
||||
|
||||
field_count::
|
||||
+
|
||||
The +UNLOAD+ command does not escape newlines when they appear into
|
||||
table data. Hence, you may obtain multi-line data files, where a
|
||||
single database row (say tuple if you prefer to) can span multiple
|
||||
physical lines into the unloaded file.
|
||||
+
|
||||
If this is your case, you may want to configure here the number of
|
||||
columns per tuple. Then pgloader will count columns and buffer line
|
||||
input in order to re-assemble several physical lines into one data row
|
||||
when needed.
|
||||
+
|
||||
This parameter is optional.
|
||||
|
||||
trailing_sep::
|
||||
+
|
||||
If this option is set to True, the input data file is known to append
|
||||
a +field_sep+ as the last character of each of its lines. With this
|
||||
option set, this last character is then not considered as a field
|
||||
separator.
|
||||
+
|
||||
This parameter is optional and defaults to +False+.
|
||||
|
||||
newline_escapes::
|
||||
+
|
||||
Sometimes the input data file has field values containing newlines,
|
||||
and the export program used (such as the Informix +UNLOAD+ command) escapes
|
||||
in-field newlines. So you want +pgloader+ to keep those newlines,
|
||||
while at the same time preserving them.
|
||||
+
|
||||
This option does the described work on specified fields and
|
||||
considering the escaping character you configure, following this
|
||||
syntax:
|
||||
+
|
||||
newline_escapes = colname:\, other_colname:§
|
||||
+
|
||||
This parameter is optional, and the extra work is only done when
|
||||
set. You can configure +newline_escapes+ for as many fields as
|
||||
necessary, and you may configure a different escaping character each
|
||||
time.
|
||||
+
|
||||
Please note that at the moment, +pgloader+ does only support one
|
||||
character length +newline_escapes+.
|
||||
+
|
||||
When both a global (see +[pgsql]+ section) +newline_escapes+ parameter
|
||||
and a table local one are set, +pgloader+ issues a warning and only
|
||||
considers the global setting.
|
||||
|
||||
== CSV FORMAT CONFIGURATION PARAMETERS ==
|
||||
|
||||
doublequote::
|
||||
|
||||
Controls how instances of +quotechar+ appearing inside a field
|
||||
should themselves be quoted. When +True+, the character is
|
||||
doubled. When +False+, the +escapechar+ is used as a prefix to the
|
||||
+quotechar+. It defaults to +True+.
|
||||
|
||||
escapechar::
|
||||
|
||||
A one-character string used by the writer to escape the delimiter
|
||||
if quoting is set to +QUOTE_NONE+ and the +quotechar+ if
|
||||
+doublequote+ is +False+. On reading, the +escapechar+ removes any
|
||||
special meaning from the following character. It defaults to
|
||||
+None+, which disables escaping.
|
||||
|
||||
quotechar::
|
||||
|
||||
A one-character string used to quote fields containing special
|
||||
characters, such as the +delimiter+ or +quotechar+, or which
|
||||
contain new-line characters. It defaults to '"'.
|
||||
|
||||
skipinitialspace::
|
||||
|
||||
When +True+, whitespace immediately following the +delimiter+ is
|
||||
ignored. The default is +False+.
|
||||
|
||||
== CONFIGURATION EXAMPLE ==
|
||||
|
||||
Please see the given configuration example which should be distributed in
|
||||
+/usr/share/doc/pgloader/examples/pgloader.conf+.
|
||||
|
||||
The example configuration file comes with example data and can be used
|
||||
as a unit test of +pgloader+.
|
||||
|
||||
== HISTORY ==
|
||||
|
||||
+pgloader+ has first been a +tcl+ tool written by Christopher
|
||||
Kings-Lynne and Jan Wieck, and then maintained by Jean-Paul
|
||||
Argudo. When it became clear it would be easier to rewrite it in
|
||||
another language than to properly learn +tcl+ and add to the project
|
||||
missing options, +pgloader+ was rewritten in python by Dimitri
|
||||
Fontaine.
|
||||
|
||||
+pgloader+ was rewritten to act as an Informix to PostgreSQL migration
|
||||
helper which imported Informix large objects directly into a
|
||||
PostgreSQL database.
|
||||
|
||||
Then as we got some data we couldn't find tools to care about, we
|
||||
decided ifx_blob would become +pgloader+, as it had to be able to
|
||||
import all Informix +UNLOAD+ data. Those data contain escaped
|
||||
separator into unquoted data field and multi-lines fields (+\r+ and
|
||||
+\n+ are not escaped).
|
||||
|
||||
== BUGS ==
|
||||
|
||||
Please report bugs to Dimitri Fontaine <dim@tapoueh.org>.
|
||||
|
||||
When last line is alone on a +COPY+ command and its parsing ends in
|
||||
error (not enough columns read for example), no information is given
|
||||
back by +pgloader+.
|
||||
|
||||
== AUTHORS ==
|
||||
|
||||
+pgloader+ is written by Dimitri Fontaine <dim@tapoueh.org>.
|
||||
|
||||
16
pgloader.py
16
pgloader.py
@ -1,5 +1,4 @@
|
||||
#! /usr/bin/env python
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
|
||||
"""
|
||||
@ -165,6 +164,9 @@ def parse_config(conffile):
|
||||
if config.has_option(section, 'client_encoding'):
|
||||
dbconn.client_encoding = config.get(section, 'client_encoding')
|
||||
|
||||
if config.has_option(section, 'datestyle'):
|
||||
dbconn.datestyle = config.get(section, 'datestyle')
|
||||
|
||||
if config.has_option(section, 'copy_every'):
|
||||
dbconn.copy_every = config.getint(section, 'copy_every')
|
||||
|
||||
@ -174,6 +176,12 @@ def parse_config(conffile):
|
||||
if config.has_option(section, 'copy_delimiter'):
|
||||
dbconn.copy_sep = config.get(section, 'copy_delimiter')
|
||||
|
||||
# optionnal global newline_escapes
|
||||
if config.has_option(section, 'newline_escapes'):
|
||||
setting = pgloader.tools.parse_config_string(
|
||||
config.get(section, 'newline_escapes'))
|
||||
pgloader.options.NEWLINE_ESCAPES = setting
|
||||
|
||||
# Then there are null and empty_string optionnal parameters
|
||||
# They canbe overriden in specific table configuration
|
||||
if config.has_option(section, 'null'):
|
||||
@ -184,12 +192,6 @@ def parse_config(conffile):
|
||||
pgloader.options.EMPTY_STRING = pgloader.tools.parse_config_string(
|
||||
config.get(section, 'empty_string'))
|
||||
|
||||
# optionnal global newline_escapes
|
||||
if config.has_option(section, 'newline_escapes'):
|
||||
setting = pgloader.tools.parse_config_string(
|
||||
config.get(section, 'newline_escapes'))
|
||||
pgloader.options.NEWLINE_ESCAPES = setting
|
||||
|
||||
except Exception, error:
|
||||
print "Error: Could not initialize PostgreSQL connection:"
|
||||
print error
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
#
|
||||
# pgloader text format reader
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
#
|
||||
# pgloader database connection handling
|
||||
@ -9,7 +8,7 @@ from cStringIO import StringIO
|
||||
|
||||
from options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC
|
||||
from options import TRUNCATE, VACUUM
|
||||
from options import INPUT_ENCODING, PG_CLIENT_ENCODING
|
||||
from options import INPUT_ENCODING, PG_CLIENT_ENCODING, DATESTYLE
|
||||
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
|
||||
|
||||
from tools import PGLoader_Error
|
||||
@ -40,6 +39,7 @@ class db:
|
||||
self.copy_every = copy_every
|
||||
self.commit_every = commit_every
|
||||
self.client_encoding = client_encoding
|
||||
self.datestyle = DATESTYLE
|
||||
self.null = NULL
|
||||
self.empty_string = EMPTY_STRING
|
||||
|
||||
@ -72,6 +72,22 @@ class db:
|
||||
cursor.execute(sql, [self.client_encoding])
|
||||
cursor.close()
|
||||
|
||||
def set_datestyle(self):
|
||||
""" set session datestyle to self.datestyle """
|
||||
|
||||
if self.datestyle is None:
|
||||
return
|
||||
|
||||
if DEBUG:
|
||||
# debug only cause reconnecting happens on every
|
||||
# configured section
|
||||
print 'Setting datestyle to %s' % self.datestyle
|
||||
|
||||
sql = 'set session datestyle to %s'
|
||||
cursor = self.dbconn.cursor()
|
||||
cursor.execute(sql, [self.datestyle])
|
||||
cursor.close()
|
||||
|
||||
def reset(self):
|
||||
""" reset internal counters and open a new database connection """
|
||||
self.buffer = None
|
||||
@ -94,6 +110,7 @@ class db:
|
||||
|
||||
self.dbconn = psycopg.connect(self.dsn)
|
||||
self.set_encoding()
|
||||
self.set_datestyle()
|
||||
|
||||
def print_stats(self):
|
||||
""" output some stats about recent activity """
|
||||
@ -339,8 +356,8 @@ class db:
|
||||
except psycopg.DatabaseError, error:
|
||||
# non recoverable error
|
||||
mesg = "\n".join(["Please check PostgreSQL logs",
|
||||
"HINT: double check your client_encoding" +
|
||||
" and copy_delimiter settings"])
|
||||
"HINT: double check your client_encoding,"+
|
||||
" datestyle and copy_delimiter settings"])
|
||||
raise PGLoader_Error, mesg
|
||||
|
||||
# prepare next run
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
#
|
||||
# pgloader Large Object support
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
#
|
||||
# Some common options, for each module to get them
|
||||
|
||||
INPUT_ENCODING = None
|
||||
PG_CLIENT_ENCODING = 'latin9'
|
||||
DATESTYLE = None
|
||||
|
||||
COPY_SEP = None
|
||||
FIELD_SEP = '|'
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
#
|
||||
# pgloader main class
|
||||
@ -77,6 +76,14 @@ class PGLoader:
|
||||
print "client_encoding: '%s'" % self.db.client_encoding
|
||||
|
||||
|
||||
# optionnal local option datestyle
|
||||
if config.has_option(name, 'datestyle'):
|
||||
self.db.datestyle = config.get(name, 'datestyle')
|
||||
|
||||
if DEBUG:
|
||||
print "datestyle: '%s'" % self.db.datestyle
|
||||
|
||||
|
||||
##
|
||||
# data filename
|
||||
for opt in ('table', 'filename'):
|
||||
@ -252,8 +259,14 @@ class PGLoader:
|
||||
f = self.__dict__[attr] = []
|
||||
|
||||
try:
|
||||
serial = 1
|
||||
|
||||
for field_def in str.split(','):
|
||||
properties = [x.strip() for x in field_def.split(':')]
|
||||
if argtype == 'int' and field_def.find(':') == -1:
|
||||
# support for automatic ordering
|
||||
properties = [field_def.strip(), serial]
|
||||
else:
|
||||
properties = [x.strip() for x in field_def.split(':')]
|
||||
|
||||
if not btype:
|
||||
# normal column definition, for COPY usage
|
||||
@ -265,6 +278,10 @@ class PGLoader:
|
||||
# UPDATE usage
|
||||
colname, arg, btype = properties
|
||||
f.append((colname, __getarg(arg, argtype), btype))
|
||||
|
||||
# update serial
|
||||
if argtype == 'int':
|
||||
serial = int(arg) + 1
|
||||
|
||||
except Exception, error:
|
||||
# FIXME: make some errors and write some error messages
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dim@tapoueh.org>
|
||||
#
|
||||
# pgloader data reader interface and defaults
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
#
|
||||
# pgloader text format reader
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# -*- coding: ISO-8859-15 -*-
|
||||
# Author: Dimitri Fontaine <dimitri@dalibo.com>
|
||||
#
|
||||
# pgloader librairies
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user