Implement skip_head_lines in configuration file, some more bugfixes

2026-05-04 18:36:12 +02:00 · 2009-03-10 17:56:16 +00:00 · 2009-03-10 17:56:16 +00:00 · 090de905b2
commit 090de905b2
parent 848595f49b
14 changed files with 95 additions and 85 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -172,6 +172,9 @@ column. See +examples/fixed+.

 == Facilities ==

+Current status::
+  Partially implemented, +skip_head_lines+ is in CVS (2.3.2~dev1)
+
 Add options:

 +skip_head_lines+::
--- a/debian/changelog
+++ b/debian/changelog
@ -1,11 +1,14 @@
-pgloader (2.3.2-1) unstable; urgency=low
+pgloader (2.3.2~dev1-1) unstable; urgency=low

  * Use psycopg cursor.copy_expert() when avaiable (> 2.0.6)
  * FIX fixedreader: it now know about -C
  * FIX Round Robin Reader with respect to offsets in readlines()
-  * allow python 2.3 to run pgloader when it does not need collections.deque (no Round Robin Reader)
+  * support python 2.3 if not using RRR (not importing collections.deque)
  * change logger initialisation to support python 2.3
  * FIX bad usage of STDERR in the code
+  * Implement skip_head_lines option in configuration (superseded by -F)
+  * Do not sort() section list when it's been given on command line
+  * Catch InterfaceError when trying to close connection

 -- Dimitri Fontaine <dim@tapoueh.org>  Wed, 17 Sep 2008 17:53:53 +0200

--- a/debian/control
+++ b/debian/control
@ -3,7 +3,7 @@ Section: misc
 Priority: extra
 Maintainer: Dimitri Fontaine <dim@tapoueh.org>
 Build-Depends: debhelper (>= 5), docbook-to-man (>= 2.0.0), python-support (>= 0.3), xmlto, asciidoc (>= 0.8.2)
-Standards-Version: 3.7.3
+Standards-Version: 3.8.0
 Homepage: http://pgloader.projects.postgresql.org

 Package: pgloader
--- a/examples/csv/csv.data
+++ b/examples/csv/csv.data
@ -1,3 +1,4 @@
+Stupid useless header with a © sign
 "2.6.190.56","2.6.190.63","33996344","33996351","GB","United Kingdom"
 "3.0.0.0","4.17.135.31","50331648","68257567","US","United States"
 "4.17.135.32","4.17.135.63","68257568","68257599","CA","Canada"
--- a/examples/pgloader.conf
+++ b/examples/pgloader.conf
@ -29,10 +29,11 @@ field_sep    = |
 trailing_sep = True

 [simple]
-use_template = simple_tmpl
-table        = simple
-filename     = simple/simple.data
-columns      = a:1, b:3, c:2
+use_template    = simple_tmpl
+table           = simple
+filename        = simple/simple.data
+columns         = a:1, b:3, c:2
+skip_head_lines = 2

 # those reject settings are defaults one
 reject_log   = /tmp/simple.rej.log
@ -114,11 +115,12 @@ fixed_specs     = a:0:10, b:10:8, c:18:8, d:26:17
 reformat        = c:pgtime:time

 [csv]
-table        = csv
-format       = csv
-filename     = csv/csv.data
-field_sep    = ,
-quotechar    = "
-columns      = x, y, a, b, d:6, c:5
-only_cols    = 3-6
+table           = csv
+format          = csv
+filename        = csv/csv.data
+field_sep       = ,
+quotechar       = "
+columns         = x, y, a, b, d:6, c:5
+only_cols       = 3-6
+skip_head_lines = 1

--- a/examples/simple/simple.data
+++ b/examples/simple/simple.data
@ -1,3 +1,5 @@
+This is a stupid useless header like you sometime find in CSV files
+id|data|date|
 1|some first row text|2006-11-11|
 2|some second row text|13/11/2006|
 3|some third row text|12-10-2006|
--- a/pgloader.1.txt
+++ b/pgloader.1.txt
@ -425,6 +425,11 @@ This parameter is optional and defaults to '\ ' (that is backslash
 followed by space). If defined on a table level, this local value will
 overwrite the global one.

+skip_head_lines::
+
+  Skip the +n+ first lines of the given files (headers)
+
+
 //////////////////////////////////////////
 index::
 +
--- a/pgloader.py
+++ b/pgloader.py
@ -493,6 +493,10 @@ def load_data():
            if s != 'pgsql':
                sections.append(s)

+        # we run through sorted section list, unless we got the section list
+        # from command line
+        sections.sort()
+
    log.info('Will consider following sections:')
    for line in myprint(sections):
        log.info(line)
@ -500,9 +504,6 @@ def load_data():
    # we count time passed from now on
    begin = time.time()

-    # we run through sorted section list
-    sections.sort()
-
    threads  = {}
    started  = {}
    finished = {}
--- a/pgloader/csvreader.py
+++ b/pgloader/csvreader.py
@ -70,43 +70,19 @@ class CSVReader(DataReader):
        self.fd = UnbufferedFileReader(self.filename, self.log,
                                       encoding = self.input_encoding,
                                       start    = self.start,
-                                       end      = self.end)
+                                       end      = self.end,
+                                       skip_head_lines = self.skip_head_lines)
        
        # don't forget COUNT and FROM_COUNT option in CSV mode
-        nb_lines     = 0
+        nb_lines     = self.skip_head_lines
        begin_linenb = None
        last_line_nb = 1

-        ##
-        # if -F was not used, we can state that begin = 0
-        #
-        # warning: FROM_ID is ignored
-        if FROM_COUNT == 0:
-            self.log.debug('beginning on first line')
-            begin_linenb = 1
-        
        # now read the lines
        for columns in csv.reader(self.fd, dialect = 'pgloader'):
            # we count logical lines
            nb_lines += 1

-            ##
-            # if -F is used, count lines to skip, and skip them
-            if FROM_COUNT > 0:
-                if nb_lines < FROM_COUNT:
-                    continue
-
-                if nb_lines == FROM_COUNT:
-                    begin_linenb = nb_lines
-                    self.log.info('reached beginning on line %d', nb_lines)
-
-            # check if we already processed COUNT lines
-            if COUNT is not None and begin_linenb is not None \
-               and (nb_lines - begin_linenb + 1) > COUNT:
-                
-                self.log.info('reached line %d, stopping', nb_lines)
-                return
-                    
            line         = self.field_sep.join(columns)
            offsets      = range(last_line_nb, self.fd.line_nb)
            last_line_nb = self.fd.line_nb
--- a/pgloader/db.py
+++ b/pgloader/db.py
@ -97,10 +97,14 @@ class db:
            try:
                self.log.info('closing current database connection')
            except IOError, e:
-                # Ignore no space left on device etc here
+                # Ignore no space left on device...
+                pass
+
+            try:
+                self.dbconn.close()
+            except InterfaceError, e:
+                # Ignore connection already closed
                pass
-            
-            self.dbconn.close()
            self.dbconn = None

    def set_encoding(self):
--- a/pgloader/fixedreader.py
+++ b/pgloader/fixedreader.py
@ -63,20 +63,13 @@ class FixedReader(DataReader):
        self.fd = UnbufferedFileReader(self.filename, self.log,
                                       encoding = self.input_encoding,
                                       start    = self.start,
-                                       end      = self.end)
+                                       end      = self.end,
+                                       skip_head_lines = self.skip_head_lines)

        # don't forget COUNT and FROM_COUNT option
-        nb_lines     = 0
+        nb_lines     = self.skip_head_lines
        begin_linenb = None

-        ##
-        # if -F was not used, we can state that begin = 0
-        #
-        # warning: FROM_ID is ignored
-        if FROM_COUNT == 0:
-            self.log.debug('beginning on first line')
-            begin_linenb = 1
-            
        for line in self.fd:
            line      = line.strip("\n")
            llen      = len(line)
@ -84,16 +77,6 @@ class FixedReader(DataReader):
            offsets   = [self.fd.line_nb]
            nb_lines += 1

-            ##
-            # if -F is used, count lines to skip, and skip them
-            if FROM_COUNT > 0:
-                if nb_lines < FROM_COUNT:
-                    continue
-
-                if nb_lines == FROM_COUNT:
-                    begin_linenb = nb_lines
-                    self.log.info('reached beginning on line %d', nb_lines)
-
            for cname, cpos in self.columns:
                start, length = self.positions[cname]

--- a/pgloader/options.py
+++ b/pgloader/options.py
@ -2,7 +2,7 @@
 #
 # Some common options, for each module to get them

-PGLOADER_VERSION = '2.3.2'
+PGLOADER_VERSION = '2.3.2~dev1'

 PSYCOPG_VERSION = None

--- a/pgloader/reader.py
+++ b/pgloader/reader.py
@ -67,6 +67,13 @@ class DataReader:
        self._getopt('field_sep', config, name, template, FIELD_SEP)
        self.field_sep = self.field_sep.decode('string-escape')

+        ##
+        # FROM_COUNT takes precedence over skip_head_lines
+        if FROM_COUNT is None or FROM_COUNT == 0:
+            self._getopt('skip_head_lines', config, name, template, 0, 'int')
+        else:
+            self.skip_head_lines = FROM_COUNT - 1
+
        if len(self.field_sep) != 1:
            raise PGLoader_Error, "field_sep must be 1 char, not %d (%s)" \
                  % (len(self.field_sep), self.field_sep)
@ -82,6 +89,8 @@ class DataReader:
            self.log.debug("reader.db %s copy_sep %s" % (self.db, self.db.copy_sep))
            
        self.log.debug("reader.readconfig field_sep: '%s'", self.field_sep)
+        self.log.debug("reader.readconfig skip_head_lines: %d",
+                       self.skip_head_lines)

    def _getopt(self, option, config, section, template, default = None, opt_type = "char"):
        """ Init given configuration option """
@ -138,7 +147,9 @@ class UnbufferedFileReader:

    def __init__(self, filename, log,
                 mode = "rb", encoding = None,
-                 start = None, end = None):
+                 start = None, end = None,
+                 skip_head_lines = 0,
+                 check_count = True):
        """ constructor """
        self.filename = filename
        self.log      = log
@ -150,6 +161,12 @@ class UnbufferedFileReader:
        self.position = 0
        self.line_nb  = 0

+        # check_count can be set to False when physical and logical
+        # line counts can diverge, as in textreader.py
+        self.check_count = check_count
+        self.skip_head_lines = skip_head_lines
+        self.reading = self.skip_head_lines == 0
+
        # we don't yet force buffering, but...
        self.bufsize = -1
        
@ -206,12 +223,33 @@ class UnbufferedFileReader:
            self.line_nb += 1
            self.position = self.fd.tell()

+            ##
+            # if head lines must be skipped (-F or skip_head_lines), skip them
+            if self.skip_head_lines > 0:
+                if self.line_nb <= self.skip_head_lines:
+                    continue
+
+                if self.line_nb == self.skip_head_lines + 1:
+                    self.reading = True
+                    self.log.info('reached beginning on line %d', self.line_nb)
+
+
+            # check if we already processed COUNT lines
+            if self.check_count:
+                if COUNT is not None and self.reading \
+                   and (self.line_nb - self.skip_head_lines + 1) > COUNT:
+
+                    self.log.info('reached line %d, stopping', nb_lines)
+                    return
+
+            # check EOF (real or multi-readers)
            if line == '' or last_line_read:
                self.log.debug("FileReader stoping, offset %d >= %s" \
                               % (self.position, self.end))
                self.fd.close()
                return
-            
+
+            # check multi-reader boundaries
            if self.end is not None and self.fd.tell() >= self.end:
                # we want to process current line and stop at next
                # iteration
--- a/pgloader/textreader.py
+++ b/pgloader/textreader.py
@ -74,14 +74,16 @@ class TextReader(DataReader):

        ##
        # if neither -I nor -F was used, we can state that begin = 0
-        if FROM_ID is None and FROM_COUNT == 0:
+        if FROM_ID is None and self.skip_head_lines == 0:
            self.log.debug('beginning on first line')
            begin_linenb = 1

        self.fd = UnbufferedFileReader(self.filename, self.log,
-                                       encoding = self.input_encoding,
-                                       start    = self.start,
-                                       end      = self.end)
+                                       encoding        = self.input_encoding,
+                                       start           = self.start,
+                                       end             = self.end,
+                                       skip_head_lines = self.skip_head_lines,
+                                       check_count     = False)
        
        for line in self.fd:
            # we count real physical lines
@ -142,16 +144,6 @@ class TextReader(DataReader):
            if self.start:
                offsets = (self.start, offsets)

-            ##
-            # if -F is used, count lines to skip, and skip them
-            if FROM_COUNT > 0:
-                if nb_lines < FROM_COUNT:
-                    continue
-
-                if nb_lines == FROM_COUNT:
-                    begin_linenb = nb_lines
-                    self.log.info('reached beginning on line %d', nb_lines)
-
            ##
            # check for beginning if option -I was used
            if FROM_ID is not None:
@ -168,7 +160,7 @@ class TextReader(DataReader):
                    # begin is set to 1 when we don't use neither -I nor -F
                    continue

-            if COUNT is not None and begin_linenb is not None \
+            if COUNT is not None and self.fd.reading \
               and (nb_lines - begin_linenb + 1) > COUNT:
                
                self.log.info('reached line %d, stopping', nb_lines)