Implement skip_head_lines in configuration file, some more bugfixes

This commit is contained in:
dim 2009-03-10 17:56:16 +00:00
parent 848595f49b
commit 090de905b2
14 changed files with 95 additions and 85 deletions

View File

@ -172,6 +172,9 @@ column. See +examples/fixed+.
== Facilities ==
Current status::
Partially implemented, +skip_head_lines+ is in CVS (2.3.2~dev1)
Add options:
+skip_head_lines+::

7
debian/changelog vendored
View File

@ -1,11 +1,14 @@
pgloader (2.3.2-1) unstable; urgency=low
pgloader (2.3.2~dev1-1) unstable; urgency=low
* Use psycopg cursor.copy_expert() when available (> 2.0.6)
* FIX fixedreader: it now knows about -C
* FIX Round Robin Reader with respect to offsets in readlines()
* allow python 2.3 to run pgloader when it does not need collections.deque (no Round Robin Reader)
* support python 2.3 if not using RRR (not importing collections.deque)
* change logger initialisation to support python 2.3
* FIX bad usage of STDERR in the code
* Implement skip_head_lines option in configuration (superseded by -F)
* Do not sort() section list when it's been given on command line
* Catch InterfaceError when trying to close connection
-- Dimitri Fontaine <dim@tapoueh.org> Wed, 17 Sep 2008 17:53:53 +0200

2
debian/control vendored
View File

@ -3,7 +3,7 @@ Section: misc
Priority: extra
Maintainer: Dimitri Fontaine <dim@tapoueh.org>
Build-Depends: debhelper (>= 5), docbook-to-man (>= 2.0.0), python-support (>= 0.3), xmlto, asciidoc (>= 0.8.2)
Standards-Version: 3.7.3
Standards-Version: 3.8.0
Homepage: http://pgloader.projects.postgresql.org
Package: pgloader

View File

@ -1,3 +1,4 @@
Stupid useless header with a © sign
"2.6.190.56","2.6.190.63","33996344","33996351","GB","United Kingdom"
"3.0.0.0","4.17.135.31","50331648","68257567","US","United States"
"4.17.135.32","4.17.135.63","68257568","68257599","CA","Canada"

View File

@ -29,10 +29,11 @@ field_sep = |
trailing_sep = True
[simple]
use_template = simple_tmpl
table = simple
filename = simple/simple.data
columns = a:1, b:3, c:2
use_template = simple_tmpl
table = simple
filename = simple/simple.data
columns = a:1, b:3, c:2
skip_head_lines = 2
# those reject settings are the default ones
reject_log = /tmp/simple.rej.log
@ -114,11 +115,12 @@ fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
reformat = c:pgtime:time
[csv]
table = csv
format = csv
filename = csv/csv.data
field_sep = ,
quotechar = "
columns = x, y, a, b, d:6, c:5
only_cols = 3-6
table = csv
format = csv
filename = csv/csv.data
field_sep = ,
quotechar = "
columns = x, y, a, b, d:6, c:5
only_cols = 3-6
skip_head_lines = 1

View File

@ -1,3 +1,5 @@
This is a stupid useless header like you sometime find in CSV files
id|data|date|
1|some first row text|2006-11-11|
2|some second row text|13/11/2006|
3|some third row text|12-10-2006|

View File

@ -425,6 +425,11 @@ This parameter is optional and defaults to '\ ' (that is backslash
followed by space). If defined on a table level, this local value will
overwrite the global one.
skip_head_lines::
Skip the first +n+ lines (headers) of the given file. When the +-F+
command line option is used, it takes precedence over this setting.
//////////////////////////////////////////
index::
+

View File

@ -493,6 +493,10 @@ def load_data():
if s != 'pgsql':
sections.append(s)
# we run through sorted section list, unless we got the section list
# from command line
sections.sort()
log.info('Will consider following sections:')
for line in myprint(sections):
log.info(line)
@ -500,9 +504,6 @@ def load_data():
# we count time passed from now on
begin = time.time()
# we run through sorted section list
sections.sort()
threads = {}
started = {}
finished = {}

View File

@ -70,43 +70,19 @@ class CSVReader(DataReader):
self.fd = UnbufferedFileReader(self.filename, self.log,
encoding = self.input_encoding,
start = self.start,
end = self.end)
end = self.end,
skip_head_lines = self.skip_head_lines)
# don't forget COUNT and FROM_COUNT option in CSV mode
nb_lines = 0
nb_lines = self.skip_head_lines
begin_linenb = None
last_line_nb = 1
##
# if -F was not used, we can state that begin = 0
#
# warning: FROM_ID is ignored
if FROM_COUNT == 0:
self.log.debug('beginning on first line')
begin_linenb = 1
# now read the lines
for columns in csv.reader(self.fd, dialect = 'pgloader'):
# we count logical lines
nb_lines += 1
##
# if -F is used, count lines to skip, and skip them
if FROM_COUNT > 0:
if nb_lines < FROM_COUNT:
continue
if nb_lines == FROM_COUNT:
begin_linenb = nb_lines
self.log.info('reached beginning on line %d', nb_lines)
# check if we already processed COUNT lines
if COUNT is not None and begin_linenb is not None \
and (nb_lines - begin_linenb + 1) > COUNT:
self.log.info('reached line %d, stopping', nb_lines)
return
line = self.field_sep.join(columns)
offsets = range(last_line_nb, self.fd.line_nb)
last_line_nb = self.fd.line_nb

View File

@ -97,10 +97,14 @@ class db:
try:
self.log.info('closing current database connection')
except IOError, e:
# Ignore no space left on device etc here
# Ignore no space left on device...
pass
try:
self.dbconn.close()
except InterfaceError, e:
# Ignore connection already closed
pass
self.dbconn.close()
self.dbconn = None
def set_encoding(self):

View File

@ -63,20 +63,13 @@ class FixedReader(DataReader):
self.fd = UnbufferedFileReader(self.filename, self.log,
encoding = self.input_encoding,
start = self.start,
end = self.end)
end = self.end,
skip_head_lines = self.skip_head_lines)
# don't forget COUNT and FROM_COUNT option
nb_lines = 0
nb_lines = self.skip_head_lines
begin_linenb = None
##
# if -F was not used, we can state that begin = 0
#
# warning: FROM_ID is ignored
if FROM_COUNT == 0:
self.log.debug('beginning on first line')
begin_linenb = 1
for line in self.fd:
line = line.strip("\n")
llen = len(line)
@ -84,16 +77,6 @@ class FixedReader(DataReader):
offsets = [self.fd.line_nb]
nb_lines += 1
##
# if -F is used, count lines to skip, and skip them
if FROM_COUNT > 0:
if nb_lines < FROM_COUNT:
continue
if nb_lines == FROM_COUNT:
begin_linenb = nb_lines
self.log.info('reached beginning on line %d', nb_lines)
for cname, cpos in self.columns:
start, length = self.positions[cname]

View File

@ -2,7 +2,7 @@
#
# Some common options, for each module to get them
PGLOADER_VERSION = '2.3.2'
PGLOADER_VERSION = '2.3.2~dev1'
PSYCOPG_VERSION = None

View File

@ -67,6 +67,13 @@ class DataReader:
self._getopt('field_sep', config, name, template, FIELD_SEP)
self.field_sep = self.field_sep.decode('string-escape')
##
# FROM_COUNT takes precedence over skip_head_lines
if FROM_COUNT is None or FROM_COUNT == 0:
self._getopt('skip_head_lines', config, name, template, 0, 'int')
else:
self.skip_head_lines = FROM_COUNT - 1
if len(self.field_sep) != 1:
raise PGLoader_Error, "field_sep must be 1 char, not %d (%s)" \
% (len(self.field_sep), self.field_sep)
@ -82,6 +89,8 @@ class DataReader:
self.log.debug("reader.db %s copy_sep %s" % (self.db, self.db.copy_sep))
self.log.debug("reader.readconfig field_sep: '%s'", self.field_sep)
self.log.debug("reader.readconfig skip_head_lines: %d",
self.skip_head_lines)
def _getopt(self, option, config, section, template, default = None, opt_type = "char"):
""" Init given configuration option """
@ -138,7 +147,9 @@ class UnbufferedFileReader:
def __init__(self, filename, log,
mode = "rb", encoding = None,
start = None, end = None):
start = None, end = None,
skip_head_lines = 0,
check_count = True):
""" constructor """
self.filename = filename
self.log = log
@ -150,6 +161,12 @@ class UnbufferedFileReader:
self.position = 0
self.line_nb = 0
# check_count can be set to False when physical lines and logical
# lines counts can diverge, like in textreader.py
self.check_count = check_count
self.skip_head_lines = skip_head_lines
self.reading = self.skip_head_lines == 0
# we don't yet force buffering, but...
self.bufsize = -1
@ -206,12 +223,33 @@ class UnbufferedFileReader:
self.line_nb += 1
self.position = self.fd.tell()
##
# if -F is used, count lines to skip, and skip them
if self.skip_head_lines > 0:
if self.line_nb <= self.skip_head_lines:
continue
if self.line_nb == self.skip_head_lines + 1:
self.reading = True
self.log.info('reached beginning on line %d', self.line_nb)
# check if we already processed COUNT lines
if self.check_count:
if COUNT is not None and self.reading \
and (self.line_nb - self.skip_head_lines + 1) > COUNT:
self.log.info('reached line %d, stopping', nb_lines)
return
# check EOF (real or multi-readers)
if line == '' or last_line_read:
self.log.debug("FileReader stoping, offset %d >= %s" \
% (self.position, self.end))
self.fd.close()
return
# check multi-reader boundaries
if self.end is not None and self.fd.tell() >= self.end:
# we want to process current line and stop at next
# iteration

View File

@ -74,14 +74,16 @@ class TextReader(DataReader):
##
# if neither -I nor -F was used, we can state that begin = 0
if FROM_ID is None and FROM_COUNT == 0:
if FROM_ID is None and self.skip_head_lines == 0:
self.log.debug('beginning on first line')
begin_linenb = 1
self.fd = UnbufferedFileReader(self.filename, self.log,
encoding = self.input_encoding,
start = self.start,
end = self.end)
encoding = self.input_encoding,
start = self.start,
end = self.end,
skip_head_lines = self.skip_head_lines,
check_count = False)
for line in self.fd:
# we count real physical lines
@ -142,16 +144,6 @@ class TextReader(DataReader):
if self.start:
offsets = (self.start, offsets)
##
# if -F is used, count lines to skip, and skip them
if FROM_COUNT > 0:
if nb_lines < FROM_COUNT:
continue
if nb_lines == FROM_COUNT:
begin_linenb = nb_lines
self.log.info('reached beginning on line %d', nb_lines)
##
# check for beginning if option -I was used
if FROM_ID is not None:
@ -168,7 +160,7 @@ class TextReader(DataReader):
# begin is set to 1 when we use neither -I nor -F
continue
if COUNT is not None and begin_linenb is not None \
if COUNT is not None and self.fd.reading \
and (nb_lines - begin_linenb + 1) > COUNT:
self.log.info('reached line %d, stopping', nb_lines)