From 090de905b25451713eb6f7bddf42dacd9925682f Mon Sep 17 00:00:00 2001 From: dim Date: Tue, 10 Mar 2009 17:56:16 +0000 Subject: [PATCH] Implement skip_head_lines in configuration file, some more bugfixes --- TODO.txt | 3 +++ debian/changelog | 7 +++++-- debian/control | 2 +- examples/csv/csv.data | 1 + examples/pgloader.conf | 24 +++++++++++---------- examples/simple/simple.data | 2 ++ pgloader.1.txt | 5 +++++ pgloader.py | 7 ++++--- pgloader/csvreader.py | 30 +++----------------------- pgloader/db.py | 10 ++++++--- pgloader/fixedreader.py | 23 +++----------------- pgloader/options.py | 2 +- pgloader/reader.py | 42 +++++++++++++++++++++++++++++++++++-- pgloader/textreader.py | 22 +++++++------------ 14 files changed, 95 insertions(+), 85 deletions(-) diff --git a/TODO.txt b/TODO.txt index 7cefaa9..250a42c 100644 --- a/TODO.txt +++ b/TODO.txt @@ -172,6 +172,9 @@ column. See +examples/fixed+. == Facilities == +Current status:: + Partially implemented, +skip_head_lines+ is in CVS (2.3.2~dev1) + Add options: +skip_head_lines+:: diff --git a/debian/changelog b/debian/changelog index d259c57..a64b4b5 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,11 +1,14 @@ -pgloader (2.3.2-1) unstable; urgency=low +pgloader (2.3.2~dev1-1) unstable; urgency=low * Use psycopg cursor.copy_expert() when avaiable (> 2.0.6) * FIX fixedreader: it now know about -C * FIX Round Robin Reader with respect to offsets in readlines() - * allow python 2.3 to run pgloader when it does not need collections.deque (no Round Robin Reader) + * support python 2.3 if not using RRR (not importing collections.deque) * change logger initialisation to support python 2.3 * FIX bad usage of STDERR in the code + * Implement skip_head_lines option in configuration (superseded by -F) + * Do not sort() section list when it's been given on command line + * Catch InterfaceError when trying to close connection -- Dimitri Fontaine Wed, 17 Sep 2008 17:53:53 +0200 diff --git a/debian/control b/debian/control index 36f6f36..0ff852e 100644 --- a/debian/control +++ b/debian/control @@ -3,7 +3,7 @@ Section: misc Priority: extra Maintainer: Dimitri Fontaine Build-Depends: debhelper (>= 5), docbook-to-man (>= 2.0.0), python-support (>= 0.3), xmlto, asciidoc (>= 0.8.2) -Standards-Version: 3.7.3 +Standards-Version: 3.8.0 Homepage: http://pgloader.projects.postgresql.org Package: pgloader diff --git a/examples/csv/csv.data b/examples/csv/csv.data index d6e01b3..3b472b8 100644 --- a/examples/csv/csv.data +++ b/examples/csv/csv.data @@ -1,3 +1,4 @@ +Stupid useless header with a © sign "2.6.190.56","2.6.190.63","33996344","33996351","GB","United Kingdom" "3.0.0.0","4.17.135.31","50331648","68257567","US","United States" "4.17.135.32","4.17.135.63","68257568","68257599","CA","Canada" diff --git a/examples/pgloader.conf b/examples/pgloader.conf index 4e31b5f..d70d798 100644 --- a/examples/pgloader.conf +++ b/examples/pgloader.conf @@ -29,10 +29,11 @@ field_sep = | trailing_sep = True [simple] -use_template = simple_tmpl -table = simple -filename = simple/simple.data -columns = a:1, b:3, c:2 +use_template = simple_tmpl +table = simple +filename = simple/simple.data +columns = a:1, b:3, c:2 +skip_head_lines = 2 # those reject settings are defaults one reject_log = /tmp/simple.rej.log @@ -114,11 +115,12 @@ fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17 reformat = c:pgtime:time [csv] -table = csv -format = csv -filename = csv/csv.data -field_sep = , -quotechar = " -columns = x, y, a, b, d:6, c:5 -only_cols = 3-6 +table = csv +format = csv +filename = csv/csv.data +field_sep = , +quotechar = " +columns = x, y, a, b, d:6, c:5 +only_cols = 3-6 +skip_head_lines = 1 diff --git a/examples/simple/simple.data b/examples/simple/simple.data index adfb1b1..537de6f 100644 --- a/examples/simple/simple.data +++ b/examples/simple/simple.data @@ -1,3 +1,5 @@ +This is a stupid useless header like you sometime find in CSV files +id|data|date| 1|some first row text|2006-11-11| 2|some second row text|13/11/2006| 3|some third row text|12-10-2006| diff --git a/pgloader.1.txt b/pgloader.1.txt index 94e2c20..121bc4a 100644 --- a/pgloader.1.txt +++ b/pgloader.1.txt @@ -425,6 +425,11 @@ This parameter is optional and defaults to '\ ' (that is backslash followed by space). If defined on a table level, this local value will overwrite the global one. +skip_head_lines:: + + Skip the +n+ first lines of the given files (headers) + + ////////////////////////////////////////// index:: + diff --git a/pgloader.py b/pgloader.py index ed9653f..a0f3ed6 100644 --- a/pgloader.py +++ b/pgloader.py @@ -493,6 +493,10 @@ def load_data(): if s != 'pgsql': sections.append(s) + # we run through sorted section list, unless we got the section list + # from command line + sections.sort() + log.info('Will consider following sections:') for line in myprint(sections): log.info(line) @@ -500,9 +504,6 @@ def load_data(): # we count time passed from now on begin = time.time() - # we run through sorted section list - sections.sort() - threads = {} started = {} finished = {} diff --git a/pgloader/csvreader.py b/pgloader/csvreader.py index ecd71c6..42f692c 100644 --- a/pgloader/csvreader.py +++ b/pgloader/csvreader.py @@ -70,43 +70,19 @@ class CSVReader(DataReader): self.fd = UnbufferedFileReader(self.filename, self.log, encoding = self.input_encoding, start = self.start, - end = self.end) + end = self.end, + skip_head_lines = self.skip_head_lines) # don't forget COUNT and FROM_COUNT option in CSV mode - nb_lines = 0 + nb_lines = self.skip_head_lines begin_linenb = None last_line_nb = 1 - ## - # if -F was not used, we can state that begin = 0 - # - # warning: FROM_ID is ignored - if FROM_COUNT == 0: - self.log.debug('beginning on first line') - begin_linenb = 1 - # now read the lines for columns in csv.reader(self.fd, dialect = 'pgloader'): # we count logical lines nb_lines += 1 - ## - # if -F is used, count lines to skip, and skip them - if FROM_COUNT > 0: - if nb_lines < FROM_COUNT: - continue - - if nb_lines == FROM_COUNT: - begin_linenb = nb_lines - self.log.info('reached beginning on line %d', nb_lines) - - # check if we already processed COUNT lines - if COUNT is not None and begin_linenb is not None \ - and (nb_lines - begin_linenb + 1) > COUNT: - - self.log.info('reached line %d, stopping', nb_lines) - return - line = self.field_sep.join(columns) offsets = range(last_line_nb, self.fd.line_nb) last_line_nb = self.fd.line_nb diff --git a/pgloader/db.py b/pgloader/db.py index 8e9fb09..1b7421c 100644 --- a/pgloader/db.py +++ b/pgloader/db.py @@ -97,10 +97,14 @@ class db: try: self.log.info('closing current database connection') except IOError, e: - # Ignore no space left on device etc here + # Ignore no space left on device... + pass + + try: + self.dbconn.close() + except InterfaceError, e: + # Ignore connection already closed pass - - self.dbconn.close() self.dbconn = None def set_encoding(self): diff --git a/pgloader/fixedreader.py b/pgloader/fixedreader.py index 8356682..2a28a94 100644 --- a/pgloader/fixedreader.py +++ b/pgloader/fixedreader.py @@ -63,20 +63,13 @@ class FixedReader(DataReader): self.fd = UnbufferedFileReader(self.filename, self.log, encoding = self.input_encoding, start = self.start, - end = self.end) + end = self.end, + skip_head_lines = self.skip_head_lines) # don't forget COUNT and FROM_COUNT option - nb_lines = 0 + nb_lines = self.skip_head_lines begin_linenb = None - ## - # if -F was not used, we can state that begin = 0 - # - # warning: FROM_ID is ignored - if FROM_COUNT == 0: - self.log.debug('beginning on first line') - begin_linenb = 1 - for line in self.fd: line = line.strip("\n") llen = len(line) @@ -84,16 +77,6 @@ class FixedReader(DataReader): offsets = [self.fd.line_nb] nb_lines += 1 - ## - # if -F is used, count lines to skip, and skip them - if FROM_COUNT > 0: - if nb_lines < FROM_COUNT: - continue - - if nb_lines == FROM_COUNT: - begin_linenb = nb_lines - self.log.info('reached beginning on line %d', nb_lines) - for cname, cpos in self.columns: start, length = self.positions[cname] diff --git a/pgloader/options.py b/pgloader/options.py index 62193a4..3e8202e 100644 --- a/pgloader/options.py +++ b/pgloader/options.py @@ -2,7 +2,7 @@ # # Some common options, for each module to get them -PGLOADER_VERSION = '2.3.2' +PGLOADER_VERSION = '2.3.2~dev1' PSYCOPG_VERSION = None diff --git a/pgloader/reader.py b/pgloader/reader.py index 4eff4bb..4b8bcaf 100644 --- a/pgloader/reader.py +++ b/pgloader/reader.py @@ -67,6 +67,13 @@ class DataReader: self._getopt('field_sep', config, name, template, FIELD_SEP) self.field_sep = self.field_sep.decode('string-escape') + ## + # FROM_COUNT takes precedence over skip_head_lines + if FROM_COUNT is None or FROM_COUNT == 0: + self._getopt('skip_head_lines', config, name, template, 0, 'int') + else: + self.skip_head_lines = FROM_COUNT - 1 + if len(self.field_sep) != 1: raise PGLoader_Error, "field_sep must be 1 char, not %d (%s)" \ % (len(self.field_sep), self.field_sep) @@ -82,6 +89,8 @@ class DataReader: self.log.debug("reader.db %s copy_sep %s" % (self.db, self.db.copy_sep)) self.log.debug("reader.readconfig field_sep: '%s'", self.field_sep) + self.log.debug("reader.readconfig skip_head_lines: %d", + self.skip_head_lines) def _getopt(self, option, config, section, template, default = None, opt_type = "char"): """ Init given configuration option """ @@ -138,7 +147,9 @@ class UnbufferedFileReader: def __init__(self, filename, log, mode = "rb", encoding = None, - start = None, end = None): + start = None, end = None, + skip_head_lines = 0, + check_count = True): """ constructor """ self.filename = filename self.log = log @@ -150,6 +161,12 @@ class UnbufferedFileReader: self.position = 0 self.line_nb = 0 + # check_count can be set to False when phisical lines and logical + # lines counts can diverge, like in textreader.py + self.check_count = check_count + self.skip_head_lines = skip_head_lines + self.reading = self.skip_head_lines == 0 + # we don't yet force buffering, but... self.bufsize = -1 @@ -206,12 +223,33 @@ class UnbufferedFileReader: self.line_nb += 1 self.position = self.fd.tell() + ## + # if -F is used, count lines to skip, and skip them + if self.skip_head_lines > 0: + if self.line_nb <= self.skip_head_lines: + continue + + if self.line_nb == self.skip_head_lines + 1: + self.reading = True + self.log.info('reached beginning on line %d', self.line_nb) + + + # check if we already processed COUNT lines + if self.check_count: + if COUNT is not None and self.reading \ + and (self.line_nb - self.skip_head_lines + 1) > COUNT: + + self.log.info('reached line %d, stopping', nb_lines) + return + + # check EOF (real or multi-readers) if line == '' or last_line_read: self.log.debug("FileReader stoping, offset %d >= %s" \ % (self.position, self.end)) self.fd.close() return - + + # check multi-reader boundaries if self.end is not None and self.fd.tell() >= self.end: # we want to process current line and stop at next # iteration diff --git a/pgloader/textreader.py b/pgloader/textreader.py index 35f4718..78368e4 100644 --- a/pgloader/textreader.py +++ b/pgloader/textreader.py @@ -74,14 +74,16 @@ class TextReader(DataReader): ## # if neither -I nor -F was used, we can state that begin = 0 - if FROM_ID is None and FROM_COUNT == 0: + if FROM_ID is None and self.skip_head_lines == 0: self.log.debug('beginning on first line') begin_linenb = 1 self.fd = UnbufferedFileReader(self.filename, self.log, - encoding = self.input_encoding, - start = self.start, - end = self.end) + encoding = self.input_encoding, + start = self.start, + end = self.end, + skip_head_lines = self.skip_head_lines, + check_count = False) for line in self.fd: # we count real physical lines @@ -142,16 +144,6 @@ class TextReader(DataReader): if self.start: offsets = (self.start, offsets) - ## - # if -F is used, count lines to skip, and skip them - if FROM_COUNT > 0: - if nb_lines < FROM_COUNT: - continue - - if nb_lines == FROM_COUNT: - begin_linenb = nb_lines - self.log.info('reached beginning on line %d', nb_lines) - ## # check for beginning if option -I was used if FROM_ID is not None: @@ -168,7 +160,7 @@ class TextReader(DataReader): # begin is set to 1 when we don't use neither -I nor -F continue - if COUNT is not None and begin_linenb is not None \ + if COUNT is not None and self.fd.reading \ and (nb_lines - begin_linenb + 1) > COUNT: self.log.info('reached line %d, stopping', nb_lines)