From 090de905b25451713eb6f7bddf42dacd9925682f Mon Sep 17 00:00:00 2001
From: dim <dim>
Date: Tue, 10 Mar 2009 17:56:16 +0000
Subject: [PATCH] Implement skip_head_lines in configuration file, some more
 bugfixes

---
 TODO.txt                    |  3 +++
 debian/changelog            |  7 +++++--
 debian/control              |  2 +-
 examples/csv/csv.data       |  1 +
 examples/pgloader.conf      | 24 +++++++++++----------
 examples/simple/simple.data |  2 ++
 pgloader.1.txt              |  5 +++++
 pgloader.py                 |  7 ++++---
 pgloader/csvreader.py       | 30 +++-----------------------
 pgloader/db.py              | 10 ++++++---
 pgloader/fixedreader.py     | 23 +++-----------------
 pgloader/options.py         |  2 +-
 pgloader/reader.py          | 42 +++++++++++++++++++++++++++++++++++--
 pgloader/textreader.py      | 22 +++++++------------
 14 files changed, 95 insertions(+), 85 deletions(-)
diff --git a/TODO.txt b/TODO.txt
index 7cefaa9..250a42c 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -172,6 +172,9 @@ column. See +examples/fixed+.
 
 == Facilities ==
 
+Current status::
+  Partially implemented, +skip_head_lines+ is in CVS (2.3.2~dev1)
+
 Add options:
 
 +skip_head_lines+::
diff --git a/debian/changelog b/debian/changelog
index d259c57..a64b4b5 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,11 +1,14 @@
-pgloader (2.3.2-1) unstable; urgency=low
+pgloader (2.3.2~dev1-1) unstable; urgency=low
 
   * Use psycopg cursor.copy_expert() when avaiable (> 2.0.6)
   * FIX fixedreader: it now know about -C
   * FIX Round Robin Reader with respect to offsets in readlines()
-  * allow python 2.3 to run pgloader when it does not need collections.deque (no Round Robin Reader)
+  * support python 2.3 if not using RRR (not importing collections.deque)
   * change logger initialisation to support python 2.3
   * FIX bad usage of STDERR in the code
+  * Implement skip_head_lines option in configuration (superseded by -F)
+  * Do not sort() section list when it's been given on command line
+  * Catch InterfaceError when trying to close connection
 
  -- Dimitri Fontaine <dim@tapoueh.org>  Wed, 17 Sep 2008 17:53:53 +0200
 
diff --git a/debian/control b/debian/control
index 36f6f36..0ff852e 100644
--- a/debian/control
+++ b/debian/control
@@ -3,7 +3,7 @@ Section: misc
 Priority: extra
 Maintainer: Dimitri Fontaine <dim@tapoueh.org>
 Build-Depends: debhelper (>= 5), docbook-to-man (>= 2.0.0), python-support (>= 0.3), xmlto, asciidoc (>= 0.8.2)
-Standards-Version: 3.7.3
+Standards-Version: 3.8.0
 Homepage: http://pgloader.projects.postgresql.org
 
 Package: pgloader
diff --git a/examples/csv/csv.data b/examples/csv/csv.data
index d6e01b3..3b472b8 100644
--- a/examples/csv/csv.data
+++ b/examples/csv/csv.data
@@ -1,3 +1,4 @@
+Stupid useless header with a © sign
 "2.6.190.56","2.6.190.63","33996344","33996351","GB","United Kingdom"
 "3.0.0.0","4.17.135.31","50331648","68257567","US","United States"
 "4.17.135.32","4.17.135.63","68257568","68257599","CA","Canada"
diff --git a/examples/pgloader.conf b/examples/pgloader.conf
index 4e31b5f..d70d798 100644
--- a/examples/pgloader.conf
+++ b/examples/pgloader.conf
@@ -29,10 +29,11 @@ field_sep    = |
 trailing_sep = True
 
 [simple]
-use_template = simple_tmpl
-table        = simple
-filename     = simple/simple.data
-columns      = a:1, b:3, c:2
+use_template    = simple_tmpl
+table           = simple
+filename        = simple/simple.data
+columns         = a:1, b:3, c:2
+skip_head_lines = 2
 
 # those reject settings are defaults one
 reject_log   = /tmp/simple.rej.log
@@ -114,11 +115,12 @@ fixed_specs     = a:0:10, b:10:8, c:18:8, d:26:17
 reformat        = c:pgtime:time
 
 [csv]
-table        = csv
-format       = csv
-filename     = csv/csv.data
-field_sep    = ,
-quotechar    = "
-columns      = x, y, a, b, d:6, c:5
-only_cols    = 3-6
+table           = csv
+format          = csv
+filename        = csv/csv.data
+field_sep       = ,
+quotechar       = "
+columns         = x, y, a, b, d:6, c:5
+only_cols       = 3-6
+skip_head_lines = 1
 
diff --git a/examples/simple/simple.data b/examples/simple/simple.data
index adfb1b1..537de6f 100644
--- a/examples/simple/simple.data
+++ b/examples/simple/simple.data
@@ -1,3 +1,5 @@
+This is a stupid useless header like you sometime find in CSV files
+id|data|date|
 1|some first row text|2006-11-11|
 2|some second row text|13/11/2006|
 3|some third row text|12-10-2006|
diff --git a/pgloader.1.txt b/pgloader.1.txt
index 94e2c20..121bc4a 100644
--- a/pgloader.1.txt
+++ b/pgloader.1.txt
@@ -425,6 +425,11 @@ This parameter is optional and defaults to '\ ' (that is backslash
 followed by space). If defined on a table level, this local value will
 overwrite the global one.
 
+skip_head_lines::
+
+  Skip the first +n+ lines of the input file (headers); -F takes precedence
+
+
 //////////////////////////////////////////
 index::
 +
diff --git a/pgloader.py b/pgloader.py
index ed9653f..a0f3ed6 100644
--- a/pgloader.py
+++ b/pgloader.py
@@ -493,6 +493,10 @@ def load_data():
             if s != 'pgsql':
                 sections.append(s)
 
+        # we run through sorted section list, unless we got the section list
+        # from command line
+        sections.sort()
+
     log.info('Will consider following sections:')
     for line in myprint(sections):
         log.info(line)
@@ -500,9 +504,6 @@ def load_data():
     # we count time passed from now on
     begin = time.time()
 
-    # we run through sorted section list
-    sections.sort()
-
     threads  = {}
     started  = {}
     finished = {}
diff --git a/pgloader/csvreader.py b/pgloader/csvreader.py
index ecd71c6..42f692c 100644
--- a/pgloader/csvreader.py
+++ b/pgloader/csvreader.py
@@ -70,43 +70,19 @@ class CSVReader(DataReader):
         self.fd = UnbufferedFileReader(self.filename, self.log,
                                        encoding = self.input_encoding,
                                        start    = self.start,
-                                       end      = self.end)
+                                       end      = self.end,
+                                       skip_head_lines = self.skip_head_lines)
         
         # don't forget COUNT and FROM_COUNT option in CSV mode
-        nb_lines     = 0
+        nb_lines     = self.skip_head_lines
         begin_linenb = None
         last_line_nb = 1
 
-        ##
-        # if -F was not used, we can state that begin = 0
-        #
-        # warning: FROM_ID is ignored
-        if FROM_COUNT == 0:
-            self.log.debug('beginning on first line')
-            begin_linenb = 1
-        
         # now read the lines
         for columns in csv.reader(self.fd, dialect = 'pgloader'):
             # we count logical lines
             nb_lines += 1
 
-            ##
-            # if -F is used, count lines to skip, and skip them
-            if FROM_COUNT > 0:
-                if nb_lines < FROM_COUNT:
-                    continue
-
-                if nb_lines == FROM_COUNT:
-                    begin_linenb = nb_lines
-                    self.log.info('reached beginning on line %d', nb_lines)
-
-            # check if we already processed COUNT lines
-            if COUNT is not None and begin_linenb is not None \
-               and (nb_lines - begin_linenb + 1) > COUNT:
-                
-                self.log.info('reached line %d, stopping', nb_lines)
-                return
-                    
             line         = self.field_sep.join(columns)
             offsets      = range(last_line_nb, self.fd.line_nb)
             last_line_nb = self.fd.line_nb
diff --git a/pgloader/db.py b/pgloader/db.py
index 8e9fb09..1b7421c 100644
--- a/pgloader/db.py
+++ b/pgloader/db.py
@@ -97,10 +97,14 @@ class db:
             try:
                 self.log.info('closing current database connection')
             except IOError, e:
-                # Ignore no space left on device etc here
+                # Ignore no space left on device...
+                pass
+
+            try:
+                self.dbconn.close()
+            except InterfaceError, e:
+                # Ignore connection already closed
                 pass
-            
-            self.dbconn.close()
             self.dbconn = None
 
     def set_encoding(self):
diff --git a/pgloader/fixedreader.py b/pgloader/fixedreader.py
index 8356682..2a28a94 100644
--- a/pgloader/fixedreader.py
+++ b/pgloader/fixedreader.py
@@ -63,20 +63,13 @@ class FixedReader(DataReader):
         self.fd = UnbufferedFileReader(self.filename, self.log,
                                        encoding = self.input_encoding,
                                        start    = self.start,
-                                       end      = self.end)
+                                       end      = self.end,
+                                       skip_head_lines = self.skip_head_lines)
 
         # don't forget COUNT and FROM_COUNT option
-        nb_lines     = 0
+        nb_lines     = self.skip_head_lines
         begin_linenb = None
 
-        ##
-        # if -F was not used, we can state that begin = 0
-        #
-        # warning: FROM_ID is ignored
-        if FROM_COUNT == 0:
-            self.log.debug('beginning on first line')
-            begin_linenb = 1
-            
         for line in self.fd:
             line      = line.strip("\n")
             llen      = len(line)
@@ -84,16 +77,6 @@ class FixedReader(DataReader):
             offsets   = [self.fd.line_nb]
             nb_lines += 1
 
-            ##
-            # if -F is used, count lines to skip, and skip them
-            if FROM_COUNT > 0:
-                if nb_lines < FROM_COUNT:
-                    continue
-
-                if nb_lines == FROM_COUNT:
-                    begin_linenb = nb_lines
-                    self.log.info('reached beginning on line %d', nb_lines)
-
             for cname, cpos in self.columns:
                 start, length = self.positions[cname]
 
diff --git a/pgloader/options.py b/pgloader/options.py
index 62193a4..3e8202e 100644
--- a/pgloader/options.py
+++ b/pgloader/options.py
@@ -2,7 +2,7 @@
 #
 # Some common options, for each module to get them
 
-PGLOADER_VERSION = '2.3.2'
+PGLOADER_VERSION = '2.3.2~dev1'
 
 PSYCOPG_VERSION = None
 
diff --git a/pgloader/reader.py b/pgloader/reader.py
index 4eff4bb..4b8bcaf 100644
--- a/pgloader/reader.py
+++ b/pgloader/reader.py
@@ -67,6 +67,13 @@ class DataReader:
         self._getopt('field_sep', config, name, template, FIELD_SEP)
         self.field_sep = self.field_sep.decode('string-escape')
 
+        ##
+        # FROM_COUNT takes precedence over skip_head_lines
+        if FROM_COUNT is None or FROM_COUNT == 0:
+            self._getopt('skip_head_lines', config, name, template, 0, 'int')
+        else:
+            self.skip_head_lines = FROM_COUNT - 1
+
         if len(self.field_sep) != 1:
             raise PGLoader_Error, "field_sep must be 1 char, not %d (%s)" \
                   % (len(self.field_sep), self.field_sep)
@@ -82,6 +89,8 @@ class DataReader:
             self.log.debug("reader.db %s copy_sep %s" % (self.db, self.db.copy_sep))
             
         self.log.debug("reader.readconfig field_sep: '%s'", self.field_sep)
+        self.log.debug("reader.readconfig skip_head_lines: %d",
+                       self.skip_head_lines)
 
     def _getopt(self, option, config, section, template, default = None, opt_type = "char"):
         """ Init given configuration option """
@@ -138,7 +147,9 @@ class UnbufferedFileReader:
 
     def __init__(self, filename, log,
                  mode = "rb", encoding = None,
-                 start = None, end = None):
+                 start = None, end = None,
+                 skip_head_lines = 0,
+                 check_count = True):
         """ constructor """
         self.filename = filename
         self.log      = log
@@ -150,6 +161,12 @@ class UnbufferedFileReader:
         self.position = 0
         self.line_nb  = 0
 
+        # check_count can be set to False when physical lines and logical
+        # line counts can diverge, like in textreader.py
+        self.check_count = check_count
+        self.skip_head_lines = skip_head_lines
+        self.reading = self.skip_head_lines == 0
+
         # we don't yet force buffering, but...
         self.bufsize = -1
         
@@ -206,12 +223,33 @@ class UnbufferedFileReader:
             self.line_nb += 1
             self.position = self.fd.tell()
 
+            ##
+            # skip header lines, whether set by -F or by skip_head_lines
+            if self.skip_head_lines > 0:
+                if self.line_nb <= self.skip_head_lines:
+                    continue
+
+                if self.line_nb == self.skip_head_lines + 1:
+                    self.reading = True
+                    self.log.info('reached beginning on line %d', self.line_nb)
+
+
+            # check if we already processed COUNT lines
+            if self.check_count:
+                if COUNT is not None and self.reading \
+                   and (self.line_nb - self.skip_head_lines + 1) > COUNT:
+
+                    self.log.info('reached line %d, stopping', nb_lines)
+                    return
+
+            # check EOF (real or multi-readers)
             if line == '' or last_line_read:
                 self.log.debug("FileReader stoping, offset %d >= %s" \
                                % (self.position, self.end))
                 self.fd.close()
                 return
-            
+
+            # check multi-reader boundaries
             if self.end is not None and self.fd.tell() >= self.end:
                 # we want to process current line and stop at next
                 # iteration
diff --git a/pgloader/textreader.py b/pgloader/textreader.py
index 35f4718..78368e4 100644
--- a/pgloader/textreader.py
+++ b/pgloader/textreader.py
@@ -74,14 +74,16 @@ class TextReader(DataReader):
 
         ##
         # if neither -I nor -F was used, we can state that begin = 0
-        if FROM_ID is None and FROM_COUNT == 0:
+        if FROM_ID is None and self.skip_head_lines == 0:
             self.log.debug('beginning on first line')
             begin_linenb = 1
 
         self.fd = UnbufferedFileReader(self.filename, self.log,
-                                       encoding = self.input_encoding,
-                                       start    = self.start,
-                                       end      = self.end)
+                                       encoding        = self.input_encoding,
+                                       start           = self.start,
+                                       end             = self.end,
+                                       skip_head_lines = self.skip_head_lines,
+                                       check_count     = False)
         
         for line in self.fd:
             # we count real physical lines
@@ -142,16 +144,6 @@ class TextReader(DataReader):
             if self.start:
                 offsets = (self.start, offsets)
 
-            ##
-            # if -F is used, count lines to skip, and skip them
-            if FROM_COUNT > 0:
-                if nb_lines < FROM_COUNT:
-                    continue
-
-                if nb_lines == FROM_COUNT:
-                    begin_linenb = nb_lines
-                    self.log.info('reached beginning on line %d', nb_lines)
-
             ##
             # check for beginning if option -I was used
             if FROM_ID is not None:
@@ -168,7 +160,7 @@ class TextReader(DataReader):
                     # begin is set to 1 when we don't use neither -I nor -F
                     continue
 
-            if COUNT is not None and begin_linenb is not None \
+            if COUNT is not None and self.fd.reading \
                and (nb_lines - begin_linenb + 1) > COUNT:
                 
                 self.log.info('reached line %d, stopping', nb_lines)