mirror of https://github.com/dimitri/pgloader.git
synced 2025-08-08 07:16:58 +02:00

CSV support, can load not-all-columns of data files, can load to table with more cols than data file

This commit is contained in:
parent 29934d7112
commit 8ed8219e37

debian/changelog (vendored) | 15
@@ -1,3 +1,18 @@
+pgloader (2.2.0) unstable; urgency=low
+
+  * Support for partial loading of data (subrange(s) of columns)
+  * COPY table (col1, col2, ..., coln) systematically used
+  * Support for CSV format (with quoting)
+
+ -- Dimitri Fontaine <dim@tapoueh.org>  Mon, 04 Jun 2007 11:13:21 +0200
+
+pgloader (2.1.0) unstable; urgency=low
+
+  * Added support for partial COPY table definition
+  * Documentation and example update (see serial)
+
+ -- Dimitri Fontaine <dim@dalibo.com>  Fri, 19 Jan 2007 12:25:39 +0100
+
 pgloader (2.0.2) unstable; urgency=low
 
   * configurable null and empty_string representations
@@ -20,8 +20,10 @@ The provided examples are:
 
 . errors
 
-Same test, but with impossible dates. Should report some errors. It does
-not report errors, check you're not using psycopg 1.1.21.
+Same test, but with impossible dates. Should report some errors. If it
+does not report errors, check you're not using psycopg 1.1.21.
+
+Should report 3 errors out of 7 lines (4 updates).
 
 . clob
 
@@ -33,23 +35,38 @@ not report errors, check you're not using psycopg 1.1.21.
 A dataset with newline escaped and multi-line input (without quoting)
 Beware of data reordering, too.
 
+. csv
+
+A dataset with csv delimiter ',' and quoting '"'.
+
+. partial
+
+A dataset from which we only load some columns of the provided one.
+
+. serial
+
+In this dataset the id field is ommited, it's a serial which will be
+automatically set by PostgreSQL while COPYing.
+
 You can launch all those pgloader tests in one run, provided you created the
 necessary tables:
 
-$ for test in simple clob cluttured; do psql pgloader < $test/$test.sql; done
+$ for sql in */*sql; do psql pgloader < $sql; done
 $ ../pgloader.py -Tc pgloader.conf
 
 [...]
 
 Table name     |    duration |    size |  updates |  errors
 ====================================================================
-clob           |      0.121s |   32 kB |        7 |       0
-cluttered      |      0.041s |   32 kB |        3 |       0
-simple         |      0.040s |   16 kB |        6 |       0
+clob           |      0.041s |   32 kB |        7 |       0
+cluttered      |      0.037s |   32 kB |        6 |       0
+csv            |      0.019s |   16 kB |        6 |       0
+errors         |      0.032s |   32 kB |        4 |       3
+partial        |      0.024s |   32 kB |        7 |       0
+serial         |      0.028s |   32 kB |        7 |       0
+simple         |      0.029s |   32 kB |        7 |       0
 ====================================================================
-Total          |      0.369s |   80 kB |       16 |       0
+Total          |      0.210s |  208 kB |       44 |       3
 
-And you then have a nice summary.
examples/csv/csv.data (new file) | 6

@@ -0,0 +1,6 @@
+"2.6.190.56","2.6.190.63","33996344","33996351","GB","United Kingdom"
+"3.0.0.0","4.17.135.31","50331648","68257567","US","United States"
+"4.17.135.32","4.17.135.63","68257568","68257599","CA","Canada"
+"4.17.135.64","4.17.142.255","68257600","68259583","US","United States"
+"4.17.143.0","4.17.143.15","68259584","68259599","CA","Canada"
+"4.17.143.16","4.18.32.71","68259600","68296775","US","United States"
examples/csv/csv.sql (new file) | 6

@@ -0,0 +1,6 @@
+CREATE TABLE csv (
+  a bigint,
+  b bigint,
+  c char(2),
+  d text
+);
examples/partial/partial.data (new file) | 7

@@ -0,0 +1,7 @@
+1%foo%bar%baz%hop
+2%foo%bar%baz%hop
+3%foo%bar%baz%hop
+4%foo%bar%baz%hop
+5%foo%bar%baz%hop
+6%foo%bar%baz%hop
+7%foo%bar%baz%hop
examples/partial/partial.sql (new file) | 7

@@ -0,0 +1,7 @@
+CREATE TABLE partial (
+  a integer primary key,
+  b text,
+  c text,
+  d text,
+  e text
+);
@@ -17,6 +17,7 @@ newline_escapes = \
 
 [simple]
 table        = simple
+format       = text
 filename     = simple/simple.data
 field_sep    = |
 trailing_sep = True
@@ -28,6 +29,7 @@ reject_data = /tmp/simple.rej
 
 [errors]
 table        = errors
+format       = text
 filename     = errors/errors.data
 field_sep    = |
 trailing_sep = True
@@ -35,6 +37,7 @@ columns = a:1, b:3, c:2
 
 [clob]
 table        = clob
+format       = text
 filename     = clob/clob.data
 field_sep    = |
 columns      = a:1, b:2
@@ -43,6 +46,7 @@ blob_columns = b:2:ifx_clob
 
 [cluttered]
 table        = cluttered
+format       = text
 filename     = cluttered/cluttered.data
 field_sep    = ^
 trailing_sep = True
@@ -52,7 +56,25 @@ columns = a:1, b:3, c:2
 
 [serial]
 table        = serial
+format       = text
 filename     = serial/serial.data
 field_sep    = ;
-partial_copy = True
 columns      = b:2, c:1
+
+[partial]
+table        = partial
+format       = text
+filename     = partial/partial.data
+field_sep    = %
+columns      = a:1, b:2, c:3, d:4, e:5
+only_cols    = 1-3, 5
+
+[csv]
+table        = csv
+format       = csv
+filename     = csv/csv.data
+field_sep    = ,
+quotechar    = "
+columns      = x:1, y:2, a:3, b:4, c:5, d:6
+only_cols    = 3-6
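To make the two new sections concrete: with the [partial] settings above, each line of partial.data is split on '%', and only fields 1-3 and 5 are kept and COPYed into partial (a, b, c, e). A short illustrative sketch in plain Python (an editor's example, not pgloader code; the expanded list [1, 2, 3, 5] is what pgloader derives from "1-3, 5"):

    line = "1%foo%bar%baz%hop"            # a row of examples/partial/partial.data
    columns = line.split("%")             # ['1', 'foo', 'bar', 'baz', 'hop']
    only_cols = [1, 2, 3, 5]              # expanded from only_cols = 1-3, 5
    kept = [columns[i - 1] for i in only_cols]
    print(kept)                           # ['1', 'foo', 'bar', 'hop']
    # loaded as: COPY partial (a, b, c, e) ...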
pgloader.1.sgml | 280

@@ -2,7 +2,7 @@
 <refentry>
   <refentryinfo>
     <address>
-      <email>dim@dalibo.com</email>
+      <email>dim@tapoueh.org</email>
     </address>
     <author>
       <firstname>Dimitri</firstname>
@@ -263,7 +263,7 @@ Import CSV data and Large Object to PostgreSQL
   </refsect1>
 
   <refsect1>
-    <title>CONFIGURATION</title>
+    <title>GLOBAL CONFIGURATION SECTION</title>
     <para>
       The configuration file has a .ini file syntax, its first section
       has to be the <command>pgsql</command> one, defining how to
@@ -404,9 +404,9 @@ Import CSV data and Large Object to PostgreSQL
           local setting).
         </para>
         <para>
-          You can setup here a global escape caracter, to be considered on
-          each and every column of each and every table defined
-          thereafter.
+          You can setup here a global escape caracter, to be
+          considered on each and every column of each and every
+          text-format table defined thereafter.
         </para>
       </listitem>
     </varlistentry>
@@ -439,12 +439,20 @@ Import CSV data and Large Object to PostgreSQL
       </listitem>
     </varlistentry>
   </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>COMMON FORMAT CONFIGURATION PARAMETERS</title>
    <para>
      You then can define any number of data section, and give them an
      arbitrary name. Some options are required, some are actually
      optionnals, in which case it is said so thereafter.
    </para>
+    <para>
+      First, we'll go through common parameters, applicable whichever
+      format of data you're refering to. Then text-format only
+      parameters will be presented, followed by csv-only parameters.
+    </para>
    <variablelist>
     <varlistentry>
       <term><option>table</option></term>
@@ -455,6 +463,19 @@ Import CSV data and Large Object to PostgreSQL
       </listitem>
     </varlistentry>
 
+    <varlistentry>
+      <term><option>format</option></term>
+      <listitem>
+        <para>
+          The format data are to be found, either
+          <command>text</command> or <command>csv</command>.
+        </para>
+        <para>
+          See next sections for format specific options.
+        </para>
+      </listitem>
+    </varlistentry>
+
     <varlistentry>
       <term><option>filename</option></term>
       <listitem>
@@ -506,45 +527,6 @@ Import CSV data and Large Object to PostgreSQL
       </listitem>
     </varlistentry>
 
-    <varlistentry>
-      <term><option>field_count</option></term>
-      <listitem>
-        <para>
-          The <command>UNLOAD</command> command does not escape
-          newlines when they appear into table data. Hence, you may
-          obtain multi-line data files, where a single database row
-          (say tuple if you prefer to) can span multiple physical
-          lines into the unloaded file.
-        </para>
-        <para>
-          If this is your case, you may want to configure here the
-          number of columns per tuple. Then
-          <command>pgloader</command> will count columns and
-          buffer line input in order to re-assemble several physical
-          lines into one data row when needed.
-        </para>
-        <para>
-          This parameter is optionnal.
-        </para>
-      </listitem>
-    </varlistentry>
-
-    <varlistentry>
-      <term><option>trailing_sep</option></term>
-      <listitem>
-        <para>
-          If this option is set to <command>True</command>, the
-          input data file is known to append a
-          <command>field_sep</command> as the last character of each
-          of its lines. With this option set, this last character is
-          then not considered as a field separator.
-        </para>
-        <para>
-          This parameter is optionnal and defaults to False.
-        </para>
-      </listitem>
-    </varlistentry>
-
     <varlistentry>
       <term><option>client_encoding</option></term>
       <listitem>
@@ -591,16 +573,137 @@ Import CSV data and Large Object to PostgreSQL
     </varlistentry>
 
     <varlistentry>
-      <term><option>partial_copy</option></term>
+      <term><option>index</option></term>
       <listitem>
        <para>
-          If your columns definition does not contain all of the
-          PostgreSQL table definition, set this parameter to
-          <command>True</command>.
+          Table index definition, to be used in blob UPDATE'ing. You
+          define an index column by giving its name and its column
+          number (as found into your data file, and counting from 1)
+          separated by a colon. If your table has a composite key,
+          then you can define multiple columns here, separated by a
+          comma.
        </para>
        <para>
-          This parameter is optionnal and defaults to
-          <command>False</command>.
+          index = colname:3, other_colname:5
        </para>
       </listitem>
     </varlistentry>
+
+    <varlistentry>
+      <term><option>columns</option></term>
+      <listitem>
+        <para>
+          You can define here table columns, with the same
+          definition format as in previous <command>index</command>
+          parameter.
+        </para>
+        <para>
+          Note you'll have to define here all the columns to be
+          found in data file, whether you want to use them all or
+          not. When not using them all, use the
+          <command>only_cols</command> parameter to restrict.
+        </para>
+        <para>
+          As of <command>pgloader 2.2</command> the column list used
+          might not be the same as the table columns definition.
+        </para>
+        <para>
+          In case you have a lot a columns per table, you will want
+          to use multiple lines for this parameter value. Python
+          <command>ConfigParser</command> module knows how to read
+          multi-line parameters, you don't have to escape anything.
+        </para>
+      </listitem>
+    </varlistentry>
+
+    <varlistentry>
+      <term><option>only_cols</option></term>
+      <listitem>
+        <para>
+          If you want to only load a part of the columns you have
+          into the data file, this option let you define which
+          columns you're interrested in. <command>only_col</command>
+          is a comma separated list of ranges or values, as in
+          following example.
+        </para>
+        <para>
+          only_cols = 1-3, 5
+        </para>
+        <para>
+          This parameter is optionnal and defaults to the list of
+          all columns given on the <command>columns</command>
+          parameter list, in the colname order.
+        </para>
+      </listitem>
+    </varlistentry>
+
+    <varlistentry>
+      <term><option>blob_columns</option></term>
+      <listitem>
+        <para>
+          The definition of the colums where to find some blob or
+          clob reference. This definition is composed by a table
+          column name, a column number (couting from one) reference
+          into the Informix <command>UNLOAD</command> data file, and
+          a large object type, separated by a colon. You can have
+          several columns in this field, separated by a
+          comma.
+        </para>
+        <para>
+          Supported large objects type are Informix blob and clob,
+          the awaited configuration string are respectively
+          <command>ifx_blob</command> for binary (bytea) content
+          type and <command>ifx_clob</command> for text type values.
+        </para>
+        <para>
+          Here's an example:
+        </para>
+        <para>
+          blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
+        </para>
+      </listitem>
+    </varlistentry>
+  </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>TEXT FORMAT CONFIGURATION PARAMETERS</title>
+    <variablelist>
+    <varlistentry>
+      <term><option>field_count</option></term>
+      <listitem>
+        <para>
+          The <command>UNLOAD</command> command does not escape
+          newlines when they appear into table data. Hence, you may
+          obtain multi-line data files, where a single database row
+          (say tuple if you prefer to) can span multiple physical
+          lines into the unloaded file.
+        </para>
+        <para>
+          If this is your case, you may want to configure here the
+          number of columns per tuple. Then
+          <command>pgloader</command> will count columns and
+          buffer line input in order to re-assemble several physical
+          lines into one data row when needed.
+        </para>
+        <para>
+          This parameter is optionnal.
+        </para>
+      </listitem>
+    </varlistentry>
+
+    <varlistentry>
+      <term><option>trailing_sep</option></term>
+      <listitem>
+        <para>
+          If this option is set to <command>True</command>, the
+          input data file is known to append a
+          <command>field_sep</command> as the last character of each
+          of its lines. With this option set, this last character is
+          then not considered as a field separator.
+        </para>
+        <para>
+          This parameter is optionnal and defaults to False.
+        </para>
       </listitem>
     </varlistentry>
@@ -644,66 +747,59 @@ Import CSV data and Large Object to PostgreSQL
       </listitem>
     </varlistentry>
 
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>CSV FORMAT CONFIGURATION PARAMETERS</title>
+
+    <variablelist>
     <varlistentry>
-      <term><option>index</option></term>
+      <term><option>doublequote</option></term>
       <listitem>
        <para>
-          Table index definition, to be used in blob UPDATE'ing. You
-          define an index column by giving its name and its column
-          number (as found into your data file, and counting from 1)
-          separated by a colon. If your table has a composite key,
-          then you can define multiple columns here, separated by a
-          comma.
-        </para>
-        <para>
-          index = colname:3, other_colname:5
+          Controls how instances of quotechar appearing inside a
+          field should be themselves be quoted. When True, the
+          character is doubled. When False, the escapechar is used
+          as a prefix to the quotechar. It defaults to True.
        </para>
       </listitem>
     </varlistentry>
 
     <varlistentry>
-      <term><option>columns</option></term>
+      <term><option>escapechar</option></term>
      <listitem>
        <para>
-          You can define here table columns, with the same
-          definition format as in previous <command>index</command>
-          parameter.
-        </para>
-        <para>
-          In case you have a lot a columns per table, you will want
-          to use ultiple lines for this parameter value. Python
-          <command>ConfigParser</command> module knows how to read
-          multi-line parameters, you don't have to escape anything.
+          A one-character string used by the writer to escape the
+          delimiter if quoting is set to QUOTE_NONE and the
+          quotechar if doublequote is False. On reading, the
+          escapechar removes any special meaning from the following
+          character. It defaults to None, which disables escaping.
        </para>
      </listitem>
     </varlistentry>
 
     <varlistentry>
-      <term><option>blob_columns</option></term>
+      <term><option>quotechar</option></term>
      <listitem>
        <para>
-          The definition of the colums where to find some blob or
-          clob reference. This definition is composed by a table
-          column name, a column number (couting from one) reference
-          into the Informix <command>UNLOAD</command> data file, and
-          a large object type, separated by a colon. You can have
-          several columns in this field, separated by a
-          comma.
-        </para>
-        <para>
-          Supported large objects type are Informix blob and clob,
-          the awaited configuration string are respectively
-          <command>ifx_blob</command> for binary (bytea) content
-          type and <command>ifx_clob</command> for text type values.
-        </para>
-        <para>
-          Here's an example:
-        </para>
-        <para>
-          blob_type = clob_column:3:ifx_blob, other_clob_column:5:ifx_clob
+          A one-character string used to quote fields containing
+          special characters, such as the delimiter or quotechar, or
+          which contain new-line characters. It defaults to '"'.
        </para>
      </listitem>
     </varlistentry>
+
+    <varlistentry>
+      <term><option>skipinitialspace</option></term>
+      <listitem>
+        <para>
+          When True, whitespace immediately following the delimiter
+          is ignored. The default is False.
+        </para>
+      </listitem>
+    </varlistentry>
+
   </variablelist>
  </refsect1>
 
@@ -737,7 +833,7 @@ Import CSV data and Large Object to PostgreSQL
  <refsect1>
    <title>BUGS</title>
    <para>
-      Please report bugs to Dimitri Fontaine <dim@dalibo.com>.
+      Please report bugs to Dimitri Fontaine <dim@tapoueh.org>.
    </para>
    <para>
      When last line is alone on a <command>COPY</command> command and its
@@ -750,7 +846,7 @@ Import CSV data and Large Object to PostgreSQL
    <title>AUTHORS</title>
    <para>
      <command>pgloader</command> is written by <author>Dimitri
-      Fontaine</author> <email>dim@dalibo.com</email>.
+      Fontaine</author> <email>dim@tapoueh.org</email>.
    </para>
  </refsect1>
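The four csv-format options documented above (doublequote, escapechar, quotechar, skipinitialspace) carry the same semantics as the attributes of a Python csv.Dialect, which is what the new CSV reader builds internally. A minimal, standalone demonstration of the doublequote/escapechar distinction, written by the editor in modern Python 3 (the code base itself is Python 2):

    import csv, io

    # The same logical field, quoted the two possible ways:
    doubled = io.StringIO('"say ""hi"" now",b\r\n')        # doublequote = True
    escaped = io.StringIO('"say \\"hi\\" now",b\r\n')      # doublequote = False

    print(next(csv.reader(doubled, quotechar='"', doublequote=True)))
    # -> ['say "hi" now', 'b']
    print(next(csv.reader(escaped, quotechar='"', doublequote=False,
                          escapechar='\\')))
    # -> ['say "hi" now', 'b']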
pgloader/csvreader.py (new file) | 83

@@ -0,0 +1,83 @@
+# -*- coding: ISO-8859-15 -*-
+# Author: Dimitri Fontaine <dimitri@dalibo.com>
+#
+# pgloader text format reader
+#
+# handles configuration, parse data, then pass them to database module for
+# COPY preparation
+
+import os, sys, os.path, time, codecs, csv
+from cStringIO import StringIO
+
+from tools import PGLoader_Error, Reject, parse_config_string
+from db import db
+from lo import ifx_clob, ifx_blob
+from reader import DataReader
+
+from options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC
+from options import TRUNCATE, VACUUM
+from options import COUNT, FROM_COUNT, FROM_ID
+from options import INPUT_ENCODING, PG_CLIENT_ENCODING
+from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
+from options import NEWLINE_ESCAPES
+
+class CSVReader(DataReader):
+    """
+    Read some CSV formatted data
+    """
+
+    def readconfig(self, name, config):
+        """ get this reader module configuration from config file """
+        DataReader.readconfig(self, name, config)
+
+        # optionnal doublequote: defaults to escaping, not doubling
+        self.doublequote = False
+        if config.has_option(name, 'doublequote'):
+            self.trailing_sep = config.get(name, 'doublequote') == 'True'
+
+        self.escapechar = None
+        if config.has_option(name, 'escapechar'):
+            self.escapechar = config.get(name, 'escapechar')[0]
+
+        self.quotechar = '"'
+        if config.has_option(name, 'quotechar'):
+            self.quotechar = config.get(name, 'quotechar')[0]
+
+        self.skipinitialspace = False
+        if config.has_option(name, 'skipinitialspace'):
+            self.skipinitialspace = config.get(name, 'skipinitialspace') == 'True'
+
+    def readlines(self):
+        """ read data from configured file, and generate (yields) for
+        each data line: line, columns and rowid """
+
+        # make a dialect, then implement a reader with it
+        class pgloader_dialect(csv.Dialect):
+            delimiter        = self.field_sep
+            doublequote      = self.doublequote
+            escapechar       = self.escapechar
+            quotechar        = self.quotechar
+            skipinitialspace = self.skipinitialspace
+
+            lineterminator   = '\r\n'
+            quoting          = csv.QUOTE_MINIMAL
+        csv.register_dialect('pgloader', pgloader_dialect)
+
+        if INPUT_ENCODING is not None:
+            try:
+                fd = codecs.open(self.filename, encoding = INPUT_ENCODING)
+            except LookupError, e:
+                # codec not found
+                raise PGLoader_Error, "Input codec: %s" % e
+        else:
+            try:
+                fd = open(self.filename, "rb")
+            except IOError, error:
+                raise PGLoader_Error, error
+
+        # now read the lines
+        for columns in csv.reader(fd, dialect = 'pgloader'):
+            line = self.field_sep.join(columns)
+            yield line, columns
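Two remarks on the file above. First, readconfig() stores the doublequote setting into self.trailing_sep, which reads like a copy-paste slip for self.doublequote in this revision. Second, the dialect-plus-generator pattern is easy to exercise on its own; here is an editor's sketch in modern Python 3 (the dialect name and file path are only examples):

    import csv

    class demo_dialect(csv.Dialect):
        delimiter        = ','
        quotechar        = '"'
        doublequote      = True
        escapechar       = None
        skipinitialspace = False
        lineterminator   = '\r\n'
        quoting          = csv.QUOTE_MINIMAL

    csv.register_dialect('pgloader-demo', demo_dialect)

    def readlines(filename):
        # yield (line, columns) pairs, like CSVReader.readlines() does
        with open(filename, newline='') as fd:
            for columns in csv.reader(fd, dialect='pgloader-demo'):
                yield ','.join(columns), columns

    for line, columns in readlines('examples/csv/csv.data'):
        print(len(columns), columns[-2:])   # e.g. 6 ['GB', 'United Kingdom']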
@@ -257,16 +257,18 @@ class db:
             print "--- COPY data buffer saved in %s ---" % n
         return n
 
-    def copy_from(self, table, partial_coldef, columns, input_line,
+    def copy_from(self, table, table_colspec, columns, input_line,
                   reject, EOF = False):
         """ Generate some COPY SQL for PostgreSQL """
         ok = True
         if not self.copy: self.copy = True
 
-        if partial_coldef is not None:
-            # we prefer not having to mess table param on the caller side
-            # as it's an implementation detail concerning db class
-            table = "%s (%s) " % (table, partial_coldef)
+        ##
+        # build the table colomns specs from parameters
+        # ie. we always issue COPY table (col1, col2, ..., coln) commands
+        table = "%s (%s) " % (table, ", ".join(table_colspec))
+
+        if DEBUG:
+            print 'COPY %s' % table
 
         if EOF or self.running_commands == self.copy_every \
            and self.buffer is not None:
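The point of the new copy_from() signature is that a column list is now always attached to the COPY target. In isolation (editor's sketch, values invented):

    table_colspec = ['a', 'b', 'c', 'd']            # from columns / only_cols
    table = "%s (%s) " % ('csv', ", ".join(table_colspec))
    print('COPY %s' % table)                        # COPY csv (a, b, c, d)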
@@ -49,14 +49,6 @@ class PGLoader:
             print
             print "[%s] parse configuration" % self.name
 
-        # some configuration elements don't have default value
-        for opt in ('table', 'filename'):
-            if config.has_option(name, opt):
-                self.__dict__[opt] = config.get(name, opt)
-            else:
-                print 'Error: please configure %s.%s' % (name, opt)
-                self.config_errors += 1
-
         ##
         # reject log and data files defaults to /tmp/<section>.rej[.log]
         if config.has_option(name, 'reject_log'):
@@ -76,48 +68,24 @@ class PGLoader:
         # reject logging
         self.reject = Reject(self.reject_log, self.reject_data)
 
-        # optionnal number of columns per line
-        self.field_count = None
-        if config.has_option(name, 'field_count'):
-            self.field_count = config.getint(name, 'field_count')
-
-        # optionnal field separator
-        self.field_sep = FIELD_SEP
-        if config.has_option(name, 'field_sep'):
-            self.field_sep = config.get(name, 'field_sep')
-
-            if not DRY_RUN:
-                if self.db.copy_sep is None:
-                    self.db.copy_sep = self.field_sep
-
-        # optionnal trailing separator option
-        self.trailing_sep = False
-        if config.has_option(name, 'trailing_sep'):
-            self.trailing_sep = config.get(name, 'trailing_sep') == 'True'
-
-        # optionnal null and empty_string per table parameters
-        if config.has_option(name, 'null'):
-            self.db.null = parse_config_string(config.get(name, 'null'))
-        else:
-            self.db.null = NULL
-
-        if config.has_option(name, 'empty_string'):
-            self.db.empty_string = parse_config_string(
-                config.get(name, 'empty_string'))
-        else:
-            self.db.empty_string = EMPTY_STRING
-
         # optionnal local option client_encoding
         if config.has_option(name, 'client_encoding'):
             self.db.client_encoding = parse_config_string(
                 config.get(name, 'client_encoding'))
 
         if DEBUG:
-            print "null: '%s'" % self.db.null
-            print "empty_string: '%s'" % self.db.empty_string
             print "client_encoding: '%s'" % self.db.client_encoding
 
+        ##
+        # data filename
+        for opt in ('table', 'filename'):
+            if config.has_option(name, opt):
+                self.__dict__[opt] = config.get(name, opt)
+            else:
+                print 'Error: please configure %s.%s' % (name, opt)
+                self.config_errors += 1
+
         ##
         # we parse some columns definitions
         if config.has_option(name, 'index'):
@@ -137,34 +105,74 @@ class PGLoader:
                 print 'blob_columns', self.blob_cols
 
+        ##
+        # We have for example columns = col1:2, col2:1
+        # this means the order of input columns is not the same as the
+        # awaited order of COPY, so we want a mapping index, here [2, 1]
+        if self.columns is not None:
+            self.col_mapping = [i for (c, i) in self.columns]
+
+        ##
         # optionnal partial loading option (sequences case)
-        self.partial_copy   = False
-        self.partial_coldef = None
+        # self.table_colspec is the column list to give to
+        # COPY table(...) command, either the cols given in
+        # the only_cols config, or the columns directly
+        self.only_cols = None
+        self.table_colspec = [n for (n, pos) in self.columns]
 
-        if config.has_option(name, 'partial_copy'):
-            self.partial_copy = config.get(name, 'partial_copy') == 'True'
+        if config.has_option(name, 'only_cols'):
+            self.only_cols = config.get(name, 'only_cols')
 
-        if self.partial_copy:
-            self.partial_coldef = [name for (name, pos) in self.columns]
-
-        # optionnal newline escaped option
-        self.newline_escapes = []
-        if config.has_option(name, 'newline_escapes'):
-            if NEWLINE_ESCAPES is not None:
-                # this parameter is globally set, will ignore local
-                # definition
-                print "Warning: ignoring %s newline_escapes option" % name
-                print "         option is set to '%s' globally" \
-                      % NEWLINE_ESCAPES
-            else:
-                self._parse_fields('newline_escapes',
-                                   config.get(name, 'newline_escapes'),
-                                   argtype = 'char')
-
-        if NEWLINE_ESCAPES is not None:
-            # set NEWLINE_ESCAPES for each table column
-            self.newline_escapes = [(a, NEWLINE_ESCAPES)
-                                    for (a, x) in self.columns]
+            ##
+            # first make an index list out of configuration
+            # which contains coma separated ranges or values
+            # as for example: only_cols = 1-3, 5
+            try:
+                only_cols = [x.strip() for x in self.only_cols.split(",")]
+                expanded = []
+
+                # expand ranges
+                for oc in only_cols:
+                    if '-' in oc:
+                        (a, b) = [int(x) for x in oc.split("-")]
+                        for i in range(a, b+1):
+                            expanded.append(i)
+                    else:
+                        expanded.append(int(oc))
+
+                self.only_cols = expanded
+                self.table_colspec = [self.columns[x-1][0] for x in expanded]
+
+            except Exception, e:
+                print 'Error: section %s, only_cols: configured range is invalid' % name
+                raise PGLoader_Error, e
+
+        if DEBUG:
+            print "only_cols", self.only_cols
+            print "table_colspec", self.table_colspec
+
+        ##
+        # data format, from which depend data reader
+        self.format = None
+        if config.has_option(name, 'format'):
+            self.format = config.get(name, 'format')
+
+            if self.format.lower() == 'csv':
+                from csvreader import CSVReader
+                self.reader = CSVReader(self.db, self.filename, self.table, self.columns)
+
+            elif self.format.lower() == 'text':
+                from textreader import TextReader
+                self.reader = TextReader(self.db, self.filename, self.table, self.columns)
+
+        if self.format is None:
+            print 'Error: %s: format parameter needed' % name
+            raise PGLoader_Error
+
+        ##
+        # parse the reader specific section options
+        self.reader.readconfig(name, config)
+
         ##
         # How can we mix those columns definitions ?
|
|||||||
"for blob importing (blob_cols), please configure index")
|
"for blob importing (blob_cols), please configure index")
|
||||||
self.config_errors += 1
|
self.config_errors += 1
|
||||||
|
|
||||||
##
|
|
||||||
# We have for example columns = col1:2, col2:1
|
|
||||||
# this means the order of input columns is not the same as the
|
|
||||||
# awaited order of COPY, so we want a mapping index, here [2, 1]
|
|
||||||
if self.columns is not None:
|
|
||||||
self.col_mapping = [i for (c, i) in self.columns]
|
|
||||||
|
|
||||||
##
|
##
|
||||||
# if options.fromid is not None it has to be either a value,
|
# if options.fromid is not None it has to be either a value,
|
||||||
@@ -268,54 +270,6 @@ class PGLoader:
             # FIXME: make some errors and write some error messages
             raise
 
-    def _split_line(self, line):
-        """ split given line and returns a columns list """
-        last_sep = 0
-        columns  = []
-        pos = line.find(self.field_sep, last_sep)
-
-        while pos != -1:
-            # don't consider backslash escaped separators melted into data
-            # warning, we may find some data|\\|data
-            # that is some escaped \ then a legal | separator
-            i=1
-            while pos-i >= 0 and line[pos-i] == '\\':
-                i += 1
-
-            # now i-1 is the number of \ preceding the separator
-            # it's a legal separator only if i-1 is even
-            if (i-1) % 2 == 0:
-                # there's no need to keep escaped delimiters inside a column
-                # and we want to avoid double-escaping them
-                columns.append(line[last_sep:pos]
-                               .replace("\\%s" % self.field_sep,
-                                        self.field_sep))
-                last_sep = pos + 1
-
-            pos = line.find(self.field_sep, pos + 1)
-
-        # append last column
-        columns.append(line[last_sep:])
-        return columns
-
-    def _rowids(self, columns):
-        """ get rowids for given input line """
-        rowids = {}
-        try:
-            for id_name, id_col in self.index:
-                rowids[id_name] = columns[id_col - 1]
-
-        except IndexError, e:
-            messages = [
-                "Warning: couldn't get id %d on column #%d" % (id_name,
-                                                               id_col),
-                str(e)
-                ]
-
-            self.reject.log(messages, line)
-
-        return rowids
-
     def summary(self):
         """ return a (duration, updates, errors) tuple """
         self.duration = time.time() - self.init_time
@@ -351,7 +305,7 @@ class PGLoader:
 
         if self.columns is not None:
             print "Notice: COPY csv data"
-            self.csv_import()
+            self.data_import()
 
         elif self.blob_cols is not None:
             # elif: COPY process also blob data
@@ -360,18 +314,9 @@ class PGLoader:
         # then show up some stats
         self.print_stats()
 
-    def csv_import(self):
-        """ import CSV data, using COPY """
+    def data_import(self):
+        """ import CSV or TEXT data, using COPY """
 
-        ##
-        # Inform database about optionnal partial columns definition
-        # usage for COPY (sequences case, e.g.)
-        if self.partial_coldef is not None:
-            partial_copy_coldef = ", ".join(self.partial_coldef)
-        else:
-            partial_copy_coldef = None
-
-        for line, columns in self.read_data():
+        for line, columns in self.reader.readlines():
             if self.blob_cols is not None:
                 columns, rowids = self.read_blob(line, columns)
@@ -379,34 +324,36 @@ class PGLoader:
                 print self.col_mapping
                 print len(columns), len(self.col_mapping)
 
-            if False and VERBOSE:
-                print line
-                for i in [37, 44, 52, 38]:
-                    print len(columns[i-1]), columns[i-1]
-                print
-
             ##
-            # Now we have to reorder the columns to match schema
-            c_ordered = [columns[i-1] for i in self.col_mapping]
+            # Now we have to reorder the columns to match schema, and only
+            # consider data matched by self.only_cols
+            if self.only_cols is not None:
+                c_ordered = [columns[self.col_mapping[i-1]-1] for i in self.only_cols]
+            else:
+                c_ordered = [columns[i-1] for i in self.col_mapping]
 
             if DRY_RUN or DEBUG:
                 print line
                 print c_ordered
                 print len(c_ordered)
-                print self.db.partial_coldef
+                print self.table_colspec
                 print
 
             if not DRY_RUN:
-                self.db.copy_from(self.table, partial_copy_coldef,
+                self.db.copy_from(self.table, self.table_colspec,
                                   c_ordered, line, self.reject)
 
         if not DRY_RUN:
             # we may need a last COPY for the rest of data
-            self.db.copy_from(self.table, partial_copy_coldef,
+            self.db.copy_from(self.table, self.table_colspec,
                               None, None, self.reject, EOF = True)
 
         return
 
+    ##
+    # BLOB data reading/parsing
+    # probably should be moved out from this file
+
     def lo_import(self):
         """ import large object data, using UPDATEs """
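The double indexing in the only_cols branch of data_import() deserves a worked miniature (editor's example, data invented). With columns = col1:2, col2:1 the mapping index is [2, 1]:

    fields = ['for-col2', 'for-col1']     # physical field order in the data file
    col_mapping = [2, 1]                  # from columns = col1:2, col2:1

    print([fields[i - 1] for i in col_mapping])
    # -> ['for-col1', 'for-col2'], the order COPY table (col1, col2) expects

    only_cols = [2]                       # restrict the load to col2
    print([fields[col_mapping[i - 1] - 1] for i in only_cols])
    # -> ['for-col2'], the order COPY table (col2) expects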
@@ -426,223 +373,6 @@ class PGLoader:
                                    rowids, cname, data, btype,
                                    line, self.reject)
 
-    def _chomp(self, input_line):
-        """ chomp end of line when necessary, and trailing_sep too """
-
-        if len(input_line) == 0:
-            if DEBUG:
-                print 'pgloader._chomp: skipping empty line'
-            return input_line
-
-        # chomp a copy of the input_line, we will need the original one
-        line = input_line[:]
-
-        if line[-2:] == "\r\n":
-            line = line[:-2]
-
-        elif line[-1] == "\r":
-            line = line[:-1]
-
-        elif line[-1] == "\n":
-            line = line[:-1]
-
-        # trailing separator to whipe out ?
-        if self.trailing_sep \
-           and line[-len(self.field_sep)] == self.field_sep:
-
-            line = line[:-len(self.field_sep)]
-
-        return line
-
-    def _escape_newlines(self, columns):
-        """ trim out newline escapes to be found inside data columns """
-        if DEBUG:
-            print 'Debug: escaping columns newlines'
-            print 'Debug:', self.newline_escapes
-
-        for (ne_col, ne_esc) in self.newline_escapes:
-            # don't forget configured col references use counting from 1
-            ne_colnum = dict(self.columns)[ne_col] - 1
-            if DEBUG:
-                print 'Debug: column %s[%d] escaped with %s' \
-                      % (ne_col, ne_colnum+1, ne_esc)
-
-            col_data = columns[ne_colnum]
-
-            if self.db.is_null(col_data) or self.db.is_empty(col_data):
-                if DEBUG:
-                    print 'Debug: skipping null or empty column'
-                continue
-
-            escaped = []
-            tmp = col_data
-
-            for line in tmp.split('\n'):
-                if len(line) == 0:
-                    if DEBUG:
-                        print 'Debug: skipping empty line'
-                    continue
-
-                if DEBUG:
-                    print 'Debug: chomping:', line
-
-                tmpline = self._chomp(line)
-                if tmpline[-1] == ne_esc:
-                    tmpline = tmpline[:-1]
-
-                    # chomp out only escaping char, not newline itself
-                    escaped.append(line[:len(tmpline)] + \
-                                   line[len(tmpline)+1:])
-
-                else:
-                    # line does not end with escaping char, keep it
-                    escaped.append(line)
-
-            columns[ne_colnum] = '\n'.join(escaped)
-        return columns
-
-    def read_data(self):
-        """ read data from configured file, and generate (yields) for
-        each data line: line, columns and rowid """
-
-        # temporary feature for controlling when to begin real inserts
-        # if first time launch, set to True.
-        input_buffer = StringIO()
-        nb_lines     = 0
-        begin_linenb = None
-        nb_plines    = 0
-
-        ##
-        # if neither -I nor -F was used, we can state that begin = 0
-        if FROM_ID is None and FROM_COUNT == 0:
-            if VERBOSE:
-                print 'Notice: beginning on first line'
-            begin_linenb = 1
-
-        if INPUT_ENCODING is not None:
-            try:
-                fd = codecs.open(self.filename, encoding = INPUT_ENCODING)
-            except LookupError, e:
-                # codec not found
-                raise PGLoader_Error, "Input codec: %s" % e
-        else:
-            try:
-                fd = open(self.filename)
-            except IOError, error:
-                raise PGLoader_Error, error
-
-        for line in fd:
-            # we count real physical lines
-            nb_plines += 1
-
-            if INPUT_ENCODING is not None:
-                # this may not be necessary, after all
-                try:
-                    line = line.encode(INPUT_ENCODING)
-                except UnicodeDecodeError, e:
-                    reject.log(['Codec error', str(e)], input_line)
-                    continue
-
-            if self.field_count is not None:
-                input_buffer.write(line)
-                # act as if this were the last input_buffer for this line
-                tmp     = self._chomp(input_buffer.getvalue())
-                columns = self._split_line(tmp)
-                nb_cols = len(columns)
-
-                # check we got them all if not and field_count was
-                # given, we have a multi-line input
-                if nb_cols < self.field_count:
-                    continue
-                else:
-                    # we have read all the logical line
-                    line = tmp
-                    input_buffer.close()
-                    input_buffer = StringIO()
-
-                    if nb_cols != self.field_count:
-                        if DEBUG:
-                            print line
-                            print columns
-                            print
-                        self.reject.log(
-                            'Error parsing columns on line ' +\
-                            '%d [row %d]: found %d columns' \
-                            % (nb_plines, nb_lines, nb_cols), line)
-            else:
-                # normal operation mode : one physical line is one
-                # logical line. we didn't split input line yet
-                line    = self._chomp(line)
-                nb_cols = None
-                columns = None
-
-            if len(line) == 0:
-                # skip empty lines
-                continue
-
-            # we count logical lines
-            nb_lines += 1
-
-            ##
-            # if -F is used, count lines to skip, and skip them
-            if FROM_COUNT > 0:
-                if nb_lines < FROM_COUNT:
-                    continue
-
-                if nb_lines == FROM_COUNT:
-                    begin_linenb = nb_lines
-                    if VERBOSE:
-                        print 'Notice: reached beginning on line %d' % nb_lines
-
-            ##
-            # check for beginning if option -I was used
-            if FROM_ID is not None:
-                if columns is None:
-                    columns = self._split_line(line)
-
-                rowids = self._rowids(columns)
-
-                if FROM_ID == rowids:
-                    begin_linenb = nb_lines
-                    if VERBOSE:
-                        print 'Notice: reached beginning on line %d' % nb_lines
-
-                elif begin_linenb is None:
-                    # begin is set to 1 when we don't use neither -I nor -F
-                    continue
-
-            if COUNT is not None and begin_linenb is not None \
-               and (nb_lines - begin_linenb + 1) > COUNT:
-
-                if VERBOSE:
-                    print 'Notice: reached line %d, stopping' % nb_lines
-                break
-
-            if columns is None:
-                columns = self._split_line(line)
-
-            if DEBUG:
-                print 'Debug: read data'
-
-            # now, we may have to apply newline_escapes on configured columns
-            if NEWLINE_ESCAPES or self.newline_escapes != []:
-                columns = self._escape_newlines(columns)
-
-            nb_cols = len(columns)
-            if nb_cols != len(self.columns):
-                if DEBUG:
-                    print line
-                    print columns
-                    print
-
-                msg = 'Error parsing columns on line ' +\
-                      '%d [row %d]: found %d columns' \
-                      % (nb_plines, nb_lines, nb_cols)
-
-                self.reject.log(msg, line)
-                continue
-
-            yield line, columns
-
     def read_blob(self, line, columns):
@@ -727,3 +457,21 @@ class PGLoader:
         return columns, rowids
 
+
+    def _rowids(self, columns):
+        """ get rowids for given input line """
+        rowids = {}
+        try:
+            for id_name, id_col in self.index:
+                rowids[id_name] = columns[id_col - 1]
+
+        except IndexError, e:
+            messages = [
+                "Warning: couldn't get id %d on column #%d" % (id_name,
+                                                               id_col),
+                str(e)
+                ]
+
+            self.reject.log(messages, line)
+
+        return rowids
pgloader/reader.py (new file) | 70

@@ -0,0 +1,70 @@
+# -*- coding: ISO-8859-15 -*-
+# Author: Dimitri Fontaine <dim@tapoueh.org>
+#
+# pgloader data reader interface and defaults
+
+from tools import PGLoader_Error, Reject, parse_config_string
+from db import db
+from lo import ifx_clob, ifx_blob
+
+from options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC
+from options import TRUNCATE, VACUUM
+from options import COUNT, FROM_COUNT, FROM_ID
+from options import INPUT_ENCODING, PG_CLIENT_ENCODING
+from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
+from options import NEWLINE_ESCAPES
+
+class DataReader:
+    """
+    Read some text formatted data, which look like CSV but are not:
+     - no quoting support
+     - multi-line support is explicit (via
+    """
+
+    def __init__(self, db, filename, table, columns):
+        """ init internal variables """
+        self.db       = db
+        self.filename = filename
+        self.table    = table
+        self.columns  = columns
+
+    def readconfig(self, name, config):
+        """ read configuration section for common options
+
+        name is configuration section name, conf the ConfigParser object
+
+        specific option reading code is to be found on subclasses
+        which implements read data parsing code.
+
+        see textreader.py and csvreader.py
+        """
+        # optionnal null and empty_string per table parameters
+        if config.has_option(name, 'null'):
+            self.db.null = parse_config_string(config.get(name, 'null'))
+        else:
+            self.db.null = NULL
+
+        if config.has_option(name, 'empty_string'):
+            self.db.empty_string = parse_config_string(
+                config.get(name, 'empty_string'))
+        else:
+            self.db.empty_string = EMPTY_STRING
+
+        # optionnal field separator
+        self.field_sep = FIELD_SEP
+        if config.has_option(name, 'field_sep'):
+            self.field_sep = config.get(name, 'field_sep')
+
+            if not DRY_RUN:
+                if self.db.copy_sep is None:
+                    self.db.copy_sep = self.field_sep
+
+        if DEBUG:
+            print "null: '%s'" % self.db.null
+            print "empty_string: '%s'" % self.db.empty_string
+
+    def readlines(self):
+        """ read data from configured file, and generate (yields) for
+        each data line: line, columns and rowid """
+        pass
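DataReader is a duck-typed base class: a new input format only has to provide readconfig() and a readlines() generator yielding (line, columns) pairs. As an editor's sketch of that contract, in modern Python 3 and with an invented class that is not part of this commit:

    class TSVReader(DataReader):
        """ hypothetical tab-separated reader, for illustration only """

        def readlines(self):
            with open(self.filename) as fd:
                for line in fd:
                    line = line.rstrip('\r\n')
                    if len(line) == 0:
                        # skip empty lines, as the shipped readers do
                        continue
                    yield line, line.split('\t')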
pgloader/textreader.py (new file) | 318

@@ -0,0 +1,318 @@
+# -*- coding: ISO-8859-15 -*-
+# Author: Dimitri Fontaine <dimitri@dalibo.com>
+#
+# pgloader text format reader
+#
+# handles configuration, parse data, then pass them to database module for
+# COPY preparation
+
+import os, sys, os.path, time, codecs
+from cStringIO import StringIO
+
+from tools import PGLoader_Error, Reject, parse_config_string
+from db import db
+from lo import ifx_clob, ifx_blob
+from reader import DataReader
+
+from options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC
+from options import TRUNCATE, VACUUM
+from options import COUNT, FROM_COUNT, FROM_ID
+from options import INPUT_ENCODING, PG_CLIENT_ENCODING
+from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
+from options import NEWLINE_ESCAPES
+
+class TextReader(DataReader):
+    """
+    Read some text formatted data, which look like CSV but are not:
+     - no quoting support
+     - trailing separator trailing support
+     - multi-line support is explicit (via field_count parameter)
+     - newline escaping in multi-line content support
+     - ...
+    """
+
+    def readconfig(self, name, config):
+        """ get this reader module configuration from config file """
+        DataReader.readconfig(self, name, config)
+
+        # optionnal number of columns per line
+        self.field_count = None
+        if config.has_option(name, 'field_count'):
+            self.field_count = config.getint(name, 'field_count')
+
+        # optionnal trailing separator option
+        self.trailing_sep = False
+        if config.has_option(name, 'trailing_sep'):
+            self.trailing_sep = config.get(name, 'trailing_sep') == 'True'
+
+        # optionnal newline escaped option
+        self.newline_escapes = []
+        if config.has_option(name, 'newline_escapes'):
+            if NEWLINE_ESCAPES is not None:
+                # this parameter is globally set, will ignore local
+                # definition
+                print "Warning: ignoring %s newline_escapes option" % name
+                print "         option is set to '%s' globally" \
+                      % NEWLINE_ESCAPES
+            else:
+                self._parse_fields('newline_escapes',
+                                   config.get(name, 'newline_escapes'),
+                                   argtype = 'char')
+
+        if NEWLINE_ESCAPES is not None:
+            # set NEWLINE_ESCAPES for each table column
+            self.newline_escapes = [(a, NEWLINE_ESCAPES)
+                                    for (a, x) in self.columns]
+
+    def readlines(self):
+        """ read data from configured file, and generate (yields) for
+        each data line: line, columns and rowid """
+
+        # temporary feature for controlling when to begin real inserts
+        # if first time launch, set to True.
+        input_buffer = StringIO()
+        nb_lines     = 0
+        begin_linenb = None
+        nb_plines    = 0
+
+        ##
+        # if neither -I nor -F was used, we can state that begin = 0
+        if FROM_ID is None and FROM_COUNT == 0:
+            if VERBOSE:
+                print 'Notice: beginning on first line'
+            begin_linenb = 1
+
+        if INPUT_ENCODING is not None:
+            try:
+                fd = codecs.open(self.filename, encoding = INPUT_ENCODING)
+            except LookupError, e:
+                # codec not found
+                raise PGLoader_Error, "Input codec: %s" % e
+        else:
+            try:
+                fd = open(self.filename)
+            except IOError, error:
+                raise PGLoader_Error, error
+
+        for line in fd:
+            # we count real physical lines
+            nb_plines += 1
+
+            if INPUT_ENCODING is not None:
+                # this may not be necessary, after all
+                try:
+                    line = line.encode(INPUT_ENCODING)
+                except UnicodeDecodeError, e:
+                    reject.log(['Codec error', str(e)], input_line)
+                    continue
+
+            if self.field_count is not None:
+                input_buffer.write(line)
+                # act as if this were the last input_buffer for this line
+                tmp     = self._chomp(input_buffer.getvalue())
+                columns = self._split_line(tmp)
+                nb_cols = len(columns)
+
+                # check we got them all if not and field_count was
+                # given, we have a multi-line input
+                if nb_cols < self.field_count:
+                    continue
+                else:
+                    # we have read all the logical line
+                    line = tmp
+                    input_buffer.close()
+                    input_buffer = StringIO()
+
+                    if nb_cols != self.field_count:
+                        if DEBUG:
+                            print line
+                            print columns
+                            print
+                        self.reject.log(
+                            'Error parsing columns on line ' +\
+                            '%d [row %d]: found %d columns' \
+                            % (nb_plines, nb_lines, nb_cols), line)
+            else:
+                # normal operation mode : one physical line is one
+                # logical line. we didn't split input line yet
+                line    = self._chomp(line)
+                nb_cols = None
+                columns = None
+
+            if len(line) == 0:
+                # skip empty lines
+                continue
+
+            # we count logical lines
+            nb_lines += 1
+
+            ##
+            # if -F is used, count lines to skip, and skip them
+            if FROM_COUNT > 0:
+                if nb_lines < FROM_COUNT:
+                    continue
+
+                if nb_lines == FROM_COUNT:
+                    begin_linenb = nb_lines
+                    if VERBOSE:
+                        print 'Notice: reached beginning on line %d' % nb_lines
+
+            ##
+            # check for beginning if option -I was used
+            if FROM_ID is not None:
+                if columns is None:
+                    columns = self._split_line(line)
+
+                rowids = self._rowids(columns)
+
+                if FROM_ID == rowids:
+                    begin_linenb = nb_lines
+                    if VERBOSE:
+                        print 'Notice: reached beginning on line %d' % nb_lines
+
+                elif begin_linenb is None:
+                    # begin is set to 1 when we don't use neither -I nor -F
+                    continue
+
+            if COUNT is not None and begin_linenb is not None \
+               and (nb_lines - begin_linenb + 1) > COUNT:
+
+                if VERBOSE:
+                    print 'Notice: reached line %d, stopping' % nb_lines
+                break
+
+            if columns is None:
+                columns = self._split_line(line)
+
+            if DEBUG:
+                print 'Debug: read data'
+
+            # now, we may have to apply newline_escapes on configured columns
+            if NEWLINE_ESCAPES or self.newline_escapes != []:
+                columns = self._escape_newlines(columns)
+
+            nb_cols = len(columns)
+            if nb_cols != len(self.columns):
+                if DEBUG:
+                    print line
+                    print columns
+                    print
+
+                msg = 'Error parsing columns on line ' +\
+                      '%d [row %d]: found %d columns' \
+                      % (nb_plines, nb_lines, nb_cols)
+
+                self.reject.log(msg, line)
+                continue
+
+            yield line, columns
+
+    def _split_line(self, line):
+        """ split given line and returns a columns list """
+        last_sep = 0
+        columns  = []
+        pos = line.find(self.field_sep, last_sep)
+
+        while pos != -1:
+            # don't consider backslash escaped separators melted into data
+            # warning, we may find some data|\\|data
+            # that is some escaped \ then a legal | separator
+            i=1
+            while pos-i >= 0 and line[pos-i] == '\\':
+                i += 1
+
+            # now i-1 is the number of \ preceding the separator
+            # it's a legal separator only if i-1 is even
+            if (i-1) % 2 == 0:
+                # there's no need to keep escaped delimiters inside a column
+                # and we want to avoid double-escaping them
+                columns.append(line[last_sep:pos]
+                               .replace("\\%s" % self.field_sep,
+                                        self.field_sep))
+                last_sep = pos + 1
+
+            pos = line.find(self.field_sep, pos + 1)
+
+        # append last column
+        columns.append(line[last_sep:])
+        return columns
+
+    def _chomp(self, input_line):
+        """ chomp end of line when necessary, and trailing_sep too """
+
+        if len(input_line) == 0:
+            if DEBUG:
+                print 'pgloader._chomp: skipping empty line'
+            return input_line
+
+        # chomp a copy of the input_line, we will need the original one
+        line = input_line[:]
+
+        if line[-2:] == "\r\n":
+            line = line[:-2]
+
+        elif line[-1] == "\r":
+            line = line[:-1]
+
+        elif line[-1] == "\n":
+            line = line[:-1]
+
+        # trailing separator to whipe out ?
+        if self.trailing_sep \
+           and line[-len(self.field_sep)] == self.field_sep:
+
+            line = line[:-len(self.field_sep)]
+
+        return line
+
+    def _escape_newlines(self, columns):
+        """ trim out newline escapes to be found inside data columns """
+        if DEBUG:
+            print 'Debug: escaping columns newlines'
+            print 'Debug:', self.newline_escapes
+
+        for (ne_col, ne_esc) in self.newline_escapes:
+            # don't forget configured col references use counting from 1
+            ne_colnum = dict(self.columns)[ne_col] - 1
+            if DEBUG:
+                print 'Debug: column %s[%d] escaped with %s' \
+                      % (ne_col, ne_colnum+1, ne_esc)
+
+            col_data = columns[ne_colnum]
+
+            if self.db.is_null(col_data) or self.db.is_empty(col_data):
+                if DEBUG:
+                    print 'Debug: skipping null or empty column'
+                continue
+
+            escaped = []
+            tmp = col_data
+
+            for line in tmp.split('\n'):
+                if len(line) == 0:
+                    if DEBUG:
+                        print 'Debug: skipping empty line'
+                    continue
+
+                if DEBUG:
+                    print 'Debug: chomping:', line
+
+                tmpline = self._chomp(line)
+                if tmpline[-1] == ne_esc:
+                    tmpline = tmpline[:-1]
+
+                    # chomp out only escaping char, not newline itself
+                    escaped.append(line[:len(tmpline)] + \
+                                   line[len(tmpline)+1:])
+
+                else:
+                    # line does not end with escaping char, keep it
+                    escaped.append(line)
+
+            columns[ne_colnum] = '\n'.join(escaped)
+        return columns
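The subtle part of _split_line() above is the backslash rule: a separator only separates when preceded by an even number of backslashes. An editor's standalone rendition in modern Python 3, with made-up sample data:

    def split_line(line, sep='|'):
        """ split on sep, honouring backslash-escaped separators """
        last_sep, columns = 0, []
        pos = line.find(sep, last_sep)
        while pos != -1:
            i = 1
            while pos - i >= 0 and line[pos - i] == '\\':
                i += 1
            if (i - 1) % 2 == 0:          # even number of backslashes
                columns.append(line[last_sep:pos].replace('\\' + sep, sep))
                last_sep = pos + 1
            pos = line.find(sep, pos + 1)
        columns.append(line[last_sep:])
        return columns

    print(split_line('a\\|b|c'))          # ['a|b', 'c'] -- the escaped '|' stays in the data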