mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-04 18:36:12 +02:00
Implement skip_head_lines in configuration file, some more bugfixes
This commit is contained in:
parent
848595f49b
commit
090de905b2
3
TODO.txt
3
TODO.txt
@ -172,6 +172,9 @@ column. See +examples/fixed+.
|
||||
|
||||
== Facilities ==
|
||||
|
||||
Current status::
|
||||
Partially implemented, +skip_head_lines+ is in CVS (2.3.2~dev1)
|
||||
|
||||
Add options:
|
||||
|
||||
+skip_head_lines+::
|
||||
|
||||
7
debian/changelog
vendored
7
debian/changelog
vendored
@ -1,11 +1,14 @@
|
||||
pgloader (2.3.2-1) unstable; urgency=low
|
||||
pgloader (2.3.2~dev1-1) unstable; urgency=low
|
||||
|
||||
* Use psycopg cursor.copy_expert() when available (> 2.0.6)
|
||||
* FIX fixedreader: it now knows about -C
|
||||
* FIX Round Robin Reader with respect to offsets in readlines()
|
||||
* allow python 2.3 to run pgloader when it does not need collections.deque (no Round Robin Reader)
|
||||
* support python 2.3 if not using RRR (not importing collections.deque)
|
||||
* change logger initialisation to support python 2.3
|
||||
* FIX bad usage of STDERR in the code
|
||||
* Implement skip_head_lines option in configuration (superseded by -F)
|
||||
* Do not sort() section list when it's been given on command line
|
||||
* Catch InterfaceError when trying to close connection
|
||||
|
||||
-- Dimitri Fontaine <dim@tapoueh.org> Wed, 17 Sep 2008 17:53:53 +0200
|
||||
|
||||
|
||||
2
debian/control
vendored
2
debian/control
vendored
@ -3,7 +3,7 @@ Section: misc
|
||||
Priority: extra
|
||||
Maintainer: Dimitri Fontaine <dim@tapoueh.org>
|
||||
Build-Depends: debhelper (>= 5), docbook-to-man (>= 2.0.0), python-support (>= 0.3), xmlto, asciidoc (>= 0.8.2)
|
||||
Standards-Version: 3.7.3
|
||||
Standards-Version: 3.8.0
|
||||
Homepage: http://pgloader.projects.postgresql.org
|
||||
|
||||
Package: pgloader
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
Stupid useless header with a © sign
|
||||
"2.6.190.56","2.6.190.63","33996344","33996351","GB","United Kingdom"
|
||||
"3.0.0.0","4.17.135.31","50331648","68257567","US","United States"
|
||||
"4.17.135.32","4.17.135.63","68257568","68257599","CA","Canada"
|
||||
|
||||
@ -29,10 +29,11 @@ field_sep = |
|
||||
trailing_sep = True
|
||||
|
||||
[simple]
|
||||
use_template = simple_tmpl
|
||||
table = simple
|
||||
filename = simple/simple.data
|
||||
columns = a:1, b:3, c:2
|
||||
use_template = simple_tmpl
|
||||
table = simple
|
||||
filename = simple/simple.data
|
||||
columns = a:1, b:3, c:2
|
||||
skip_head_lines = 2
|
||||
|
||||
# those reject settings are defaults one
|
||||
reject_log = /tmp/simple.rej.log
|
||||
@ -114,11 +115,12 @@ fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
|
||||
reformat = c:pgtime:time
|
||||
|
||||
[csv]
|
||||
table = csv
|
||||
format = csv
|
||||
filename = csv/csv.data
|
||||
field_sep = ,
|
||||
quotechar = "
|
||||
columns = x, y, a, b, d:6, c:5
|
||||
only_cols = 3-6
|
||||
table = csv
|
||||
format = csv
|
||||
filename = csv/csv.data
|
||||
field_sep = ,
|
||||
quotechar = "
|
||||
columns = x, y, a, b, d:6, c:5
|
||||
only_cols = 3-6
|
||||
skip_head_lines = 1
|
||||
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
This is a stupid useless header like you sometime find in CSV files
|
||||
id|data|date|
|
||||
1|some first row text|2006-11-11|
|
||||
2|some second row text|13/11/2006|
|
||||
3|some third row text|12-10-2006|
|
||||
|
||||
@ -425,6 +425,11 @@ This parameter is optional and defaults to '\ ' (that is backslash
|
||||
followed by space). If defined on a table level, this local value will
|
||||
overwrite the global one.
|
||||
|
||||
skip_head_lines::
|
||||
|
||||
Skip the first +n+ lines of the given files (headers)
|
||||
|
||||
|
||||
//////////////////////////////////////////
|
||||
index::
|
||||
+
|
||||
|
||||
@ -493,6 +493,10 @@ def load_data():
|
||||
if s != 'pgsql':
|
||||
sections.append(s)
|
||||
|
||||
# we run through sorted section list, unless we got the section list
|
||||
# from command line
|
||||
sections.sort()
|
||||
|
||||
log.info('Will consider following sections:')
|
||||
for line in myprint(sections):
|
||||
log.info(line)
|
||||
@ -500,9 +504,6 @@ def load_data():
|
||||
# we count time passed from now on
|
||||
begin = time.time()
|
||||
|
||||
# we run through sorted section list
|
||||
sections.sort()
|
||||
|
||||
threads = {}
|
||||
started = {}
|
||||
finished = {}
|
||||
|
||||
@ -70,43 +70,19 @@ class CSVReader(DataReader):
|
||||
self.fd = UnbufferedFileReader(self.filename, self.log,
|
||||
encoding = self.input_encoding,
|
||||
start = self.start,
|
||||
end = self.end)
|
||||
end = self.end,
|
||||
skip_head_lines = self.skip_head_lines)
|
||||
|
||||
# don't forget COUNT and FROM_COUNT option in CSV mode
|
||||
nb_lines = 0
|
||||
nb_lines = self.skip_head_lines
|
||||
begin_linenb = None
|
||||
last_line_nb = 1
|
||||
|
||||
##
|
||||
# if -F was not used, we can state that begin = 0
|
||||
#
|
||||
# warning: FROM_ID is ignored
|
||||
if FROM_COUNT == 0:
|
||||
self.log.debug('beginning on first line')
|
||||
begin_linenb = 1
|
||||
|
||||
# now read the lines
|
||||
for columns in csv.reader(self.fd, dialect = 'pgloader'):
|
||||
# we count logical lines
|
||||
nb_lines += 1
|
||||
|
||||
##
|
||||
# if -F is used, count lines to skip, and skip them
|
||||
if FROM_COUNT > 0:
|
||||
if nb_lines < FROM_COUNT:
|
||||
continue
|
||||
|
||||
if nb_lines == FROM_COUNT:
|
||||
begin_linenb = nb_lines
|
||||
self.log.info('reached beginning on line %d', nb_lines)
|
||||
|
||||
# check if we already processed COUNT lines
|
||||
if COUNT is not None and begin_linenb is not None \
|
||||
and (nb_lines - begin_linenb + 1) > COUNT:
|
||||
|
||||
self.log.info('reached line %d, stopping', nb_lines)
|
||||
return
|
||||
|
||||
line = self.field_sep.join(columns)
|
||||
offsets = range(last_line_nb, self.fd.line_nb)
|
||||
last_line_nb = self.fd.line_nb
|
||||
|
||||
@ -97,10 +97,14 @@ class db:
|
||||
try:
|
||||
self.log.info('closing current database connection')
|
||||
except IOError, e:
|
||||
# Ignore no space left on device etc here
|
||||
# Ignore no space left on device...
|
||||
pass
|
||||
|
||||
try:
|
||||
self.dbconn.close()
|
||||
except InterfaceError, e:
|
||||
# Ignore connection already closed
|
||||
pass
|
||||
|
||||
self.dbconn.close()
|
||||
self.dbconn = None
|
||||
|
||||
def set_encoding(self):
|
||||
|
||||
@ -63,20 +63,13 @@ class FixedReader(DataReader):
|
||||
self.fd = UnbufferedFileReader(self.filename, self.log,
|
||||
encoding = self.input_encoding,
|
||||
start = self.start,
|
||||
end = self.end)
|
||||
end = self.end,
|
||||
skip_head_lines = self.skip_head_lines)
|
||||
|
||||
# don't forget COUNT and FROM_COUNT option
|
||||
nb_lines = 0
|
||||
nb_lines = self.skip_head_lines
|
||||
begin_linenb = None
|
||||
|
||||
##
|
||||
# if -F was not used, we can state that begin = 0
|
||||
#
|
||||
# warning: FROM_ID is ignored
|
||||
if FROM_COUNT == 0:
|
||||
self.log.debug('beginning on first line')
|
||||
begin_linenb = 1
|
||||
|
||||
for line in self.fd:
|
||||
line = line.strip("\n")
|
||||
llen = len(line)
|
||||
@ -84,16 +77,6 @@ class FixedReader(DataReader):
|
||||
offsets = [self.fd.line_nb]
|
||||
nb_lines += 1
|
||||
|
||||
##
|
||||
# if -F is used, count lines to skip, and skip them
|
||||
if FROM_COUNT > 0:
|
||||
if nb_lines < FROM_COUNT:
|
||||
continue
|
||||
|
||||
if nb_lines == FROM_COUNT:
|
||||
begin_linenb = nb_lines
|
||||
self.log.info('reached beginning on line %d', nb_lines)
|
||||
|
||||
for cname, cpos in self.columns:
|
||||
start, length = self.positions[cname]
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
#
|
||||
# Some common options, for each module to get them
|
||||
|
||||
PGLOADER_VERSION = '2.3.2'
|
||||
PGLOADER_VERSION = '2.3.2~dev1'
|
||||
|
||||
PSYCOPG_VERSION = None
|
||||
|
||||
|
||||
@ -67,6 +67,13 @@ class DataReader:
|
||||
self._getopt('field_sep', config, name, template, FIELD_SEP)
|
||||
self.field_sep = self.field_sep.decode('string-escape')
|
||||
|
||||
##
|
||||
# FROM_COUNT takes precedence over skip_head_lines
|
||||
if FROM_COUNT is None or FROM_COUNT == 0:
|
||||
self._getopt('skip_head_lines', config, name, template, 0, 'int')
|
||||
else:
|
||||
self.skip_head_lines = FROM_COUNT - 1
|
||||
|
||||
if len(self.field_sep) != 1:
|
||||
raise PGLoader_Error, "field_sep must be 1 char, not %d (%s)" \
|
||||
% (len(self.field_sep), self.field_sep)
|
||||
@ -82,6 +89,8 @@ class DataReader:
|
||||
self.log.debug("reader.db %s copy_sep %s" % (self.db, self.db.copy_sep))
|
||||
|
||||
self.log.debug("reader.readconfig field_sep: '%s'", self.field_sep)
|
||||
self.log.debug("reader.readconfig skip_head_lines: %d",
|
||||
self.skip_head_lines)
|
||||
|
||||
def _getopt(self, option, config, section, template, default = None, opt_type = "char"):
|
||||
""" Init given configuration option """
|
||||
@ -138,7 +147,9 @@ class UnbufferedFileReader:
|
||||
|
||||
def __init__(self, filename, log,
|
||||
mode = "rb", encoding = None,
|
||||
start = None, end = None):
|
||||
start = None, end = None,
|
||||
skip_head_lines = 0,
|
||||
check_count = True):
|
||||
""" constructor """
|
||||
self.filename = filename
|
||||
self.log = log
|
||||
@ -150,6 +161,12 @@ class UnbufferedFileReader:
|
||||
self.position = 0
|
||||
self.line_nb = 0
|
||||
|
||||
# check_count can be set to False when phisical lines and logical
|
||||
# lines counts can diverge, like in textreader.py
|
||||
self.check_count = check_count
|
||||
self.skip_head_lines = skip_head_lines
|
||||
self.reading = self.skip_head_lines == 0
|
||||
|
||||
# we don't yet force buffering, but...
|
||||
self.bufsize = -1
|
||||
|
||||
@ -206,12 +223,33 @@ class UnbufferedFileReader:
|
||||
self.line_nb += 1
|
||||
self.position = self.fd.tell()
|
||||
|
||||
##
|
||||
# if -F is used, count lines to skip, and skip them
|
||||
if self.skip_head_lines > 0:
|
||||
if self.line_nb <= self.skip_head_lines:
|
||||
continue
|
||||
|
||||
if self.line_nb == self.skip_head_lines + 1:
|
||||
self.reading = True
|
||||
self.log.info('reached beginning on line %d', self.line_nb)
|
||||
|
||||
|
||||
# check if we already processed COUNT lines
|
||||
if self.check_count:
|
||||
if COUNT is not None and self.reading \
|
||||
and (self.line_nb - self.skip_head_lines + 1) > COUNT:
|
||||
|
||||
self.log.info('reached line %d, stopping', nb_lines)
|
||||
return
|
||||
|
||||
# check EOF (real or multi-readers)
|
||||
if line == '' or last_line_read:
|
||||
self.log.debug("FileReader stoping, offset %d >= %s" \
|
||||
% (self.position, self.end))
|
||||
self.fd.close()
|
||||
return
|
||||
|
||||
|
||||
# check multi-reader boundaries
|
||||
if self.end is not None and self.fd.tell() >= self.end:
|
||||
# we want to process current line and stop at next
|
||||
# iteration
|
||||
|
||||
@ -74,14 +74,16 @@ class TextReader(DataReader):
|
||||
|
||||
##
|
||||
# if neither -I nor -F was used, we can state that begin = 0
|
||||
if FROM_ID is None and FROM_COUNT == 0:
|
||||
if FROM_ID is None and self.skip_head_lines == 0:
|
||||
self.log.debug('beginning on first line')
|
||||
begin_linenb = 1
|
||||
|
||||
self.fd = UnbufferedFileReader(self.filename, self.log,
|
||||
encoding = self.input_encoding,
|
||||
start = self.start,
|
||||
end = self.end)
|
||||
encoding = self.input_encoding,
|
||||
start = self.start,
|
||||
end = self.end,
|
||||
skip_head_lines = self.skip_head_lines,
|
||||
check_count = False)
|
||||
|
||||
for line in self.fd:
|
||||
# we count real physical lines
|
||||
@ -142,16 +144,6 @@ class TextReader(DataReader):
|
||||
if self.start:
|
||||
offsets = (self.start, offsets)
|
||||
|
||||
##
|
||||
# if -F is used, count lines to skip, and skip them
|
||||
if FROM_COUNT > 0:
|
||||
if nb_lines < FROM_COUNT:
|
||||
continue
|
||||
|
||||
if nb_lines == FROM_COUNT:
|
||||
begin_linenb = nb_lines
|
||||
self.log.info('reached beginning on line %d', nb_lines)
|
||||
|
||||
##
|
||||
# check for beginning if option -I was used
|
||||
if FROM_ID is not None:
|
||||
@ -168,7 +160,7 @@ class TextReader(DataReader):
|
||||
# begin is set to 1 when we don't use neither -I nor -F
|
||||
continue
|
||||
|
||||
if COUNT is not None and begin_linenb is not None \
|
||||
if COUNT is not None and self.fd.reading \
|
||||
and (nb_lines - begin_linenb + 1) > COUNT:
|
||||
|
||||
self.log.info('reached line %d, stopping', nb_lines)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user