Implement support for the CSV field_size_limit, new in python 2.5.

Dimitri Fontaine 2010-04-06 16:35:39 +02:00
parent 411a9a53d7
commit 1b6c0f4735
6 changed files with 53 additions and 10 deletions
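
For context, a minimal sketch of the stdlib call this commit wraps, assuming python 2.5 or later; the 200000-byte field and the 512 kB limit are made-up values for illustration, not part of the commit:

    # the default limit is 131072 bytes; a longer field makes csv.reader
    # raise csv.Error ("field larger than field limit (131072)")
    import csv
    from StringIO import StringIO

    data = StringIO('key,"%s"\n' % ('x' * 200000))

    csv.field_size_limit(512 * 1024)        # returns the previous limit

    reader = csv.reader(data)
    print len(reader.next()[1])             # 200000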

.gitignore

@@ -15,3 +15,5 @@
 /reformat/pgtime.pyc
 /pgloader.1
 /pgloader.1.html
+/BUGS.html
+/TODO.html

debian/changelog

@@ -1,4 +1,4 @@
-pgloader (2.3.3-1) unstable; urgency=low
+pgloader (2.3.3~dev1-1) unstable; urgency=low
 
   * Implement -f --field-sep to overwrite the default from command line
   * Add support for filename arguments, which use defaults
@@ -7,6 +7,7 @@ pgloader (2.3.3-1) unstable; urgency=low
   * Support setting any PG option (-o and config file)
   * Have --debug show a traceback
   * Fix a bug where pgloader would freeze on early error (no such file)
+  * Implement an option to set csv field size limit
 
  -- Dimitri Fontaine <dim@tapoueh.org>  Sun, 4 Apr 2010 19:34:39 +0200

@@ -121,12 +121,13 @@ fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
 reformat = c:pgtime:time
 
 [csv]
-table           = csv
-format          = csv
-filename        = csv/csv.data
-field_sep       = ,
-quotechar       = "
-columns         = x, y, a, b, d:6, c:5
-only_cols       = 3-6
-skip_head_lines = 1
+table            = csv
+format           = csv
+filename         = csv/csv.data
+field_size_limit = 512kB
+field_sep        = ,
+quotechar        = "
+columns          = x, y, a, b, d:6, c:5
+only_cols        = 3-6
+skip_head_lines  = 1

@@ -777,6 +777,13 @@ skipinitialspace::
 
     When +True+, whitespace immediately following the +delimiter+ is
     ignored. The default is +False+.
 
+field_size_limit::
+
+    Sets the maximum field size allowed by the python +CSV+ parser. Accepts
+    a number of bytes (integer), or a string containing a number followed by
+    one of these units (case sensitive): +kB+, +MB+, +GB+, +TB+. Requires at
+    least python 2.5.
+
 == FIXED FORMAT CONFIGURATION PARAMETERS ==
 
 fixed_specs::

@@ -44,8 +44,10 @@ class CSVReader(DataReader):
         if self.skipinitialspace is not False:
             self.skipinitialspace = self.skipinitialspace == 'True'
 
+        self._getopt('field_size_limit', config, name, template, -1, "mem")
+
         for opt in ['doublequote', 'escapechar',
-                    'quotechar', 'skipinitialspace']:
+                    'quotechar', 'skipinitialspace', 'field_size_limit']:
             self.log.debug("reader.readconfig %s: '%s'" \
                            % (opt, self.__dict__[opt]))
@@ -78,6 +80,17 @@ class CSVReader(DataReader):
         begin_linenb = None
         last_line_nb = 1
 
+        # set the field_size_limit, from python 2.5
+        if self.field_size_limit != -1:
+            try:
+                csv.field_size_limit(self.field_size_limit)
+                self.log.debug("csv.field_size_limit(%d)" \
+                               % self.field_size_limit)
+            except AttributeError:
+                # 'module' object has no attribute 'field_size_limit'
+                self.log.warning("field_size_limit is new in python version 2.5")
+                pass
+
         # now read the lines
         for columns in csv.reader(self.fd, dialect = 'pgloader'):
             # we count logical lines
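
The guard above lets pgloader degrade gracefully on interpreters older than 2.5. A standalone sketch of the same pattern, using the stdlib logging module in place of pgloader's self.log (an assumption made for illustration):

    import csv
    import logging

    log = logging.getLogger('csvreader')

    def apply_field_size_limit(limit):
        # -1 is the configured default, meaning "leave the parser limit alone"
        if limit == -1:
            return
        try:
            csv.field_size_limit(limit)
            log.debug("csv.field_size_limit(%d)" % limit)
        except AttributeError:
            # python < 2.5: the csv module has no field_size_limit at all
            log.warning("field_size_limit is new in python version 2.5")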

@@ -32,6 +32,10 @@ class DataReader:
         self.table   = table
         self.columns = columns
         self.reject  = reject
 
+        self.mem_units = {'kB': 1024,
+                          'MB': 1024*1024,
+                          'GB': 1024*1024*1024,
+                          'TB': 1024*1024*1024*1024}
 
         if self.input_encoding is None:
             if INPUT_ENCODING is not None:
@@ -117,6 +121,21 @@ class DataReader:
                                % (section, option, self.__dict__[option]))
                 raise PGLoader_Error, e
 
+        elif opt_type == 'mem' and self.__dict__[option] is not None:
+            try:
+                opt = self.__dict__[option]
+                if type(opt) == type("string") \
+                       and len(opt) > 2 and opt[-2:] in self.mem_units:
+                    unit = opt[-2:]
+                    size = int(opt[:-2]) * self.mem_units[unit]
+                    self.__dict__[option] = int(size)
+                else:
+                    self.__dict__[option] = int(self.__dict__[option])
+            except ValueError, e:
+                self.log.error('Configuration option %s.%s is not a memsize: %s' \
+                               % (section, option, self.__dict__[option]))
+                raise PGLoader_Error, e
+
         return self.__dict__[option]
 
     def readlines(self):
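
The "mem" option type above is what turns a value like 512kB into bytes. A standalone sketch of that arithmetic; parse_memsize and the doctest values are made up for illustration and are not pgloader's API:

    MEM_UNITS = {'kB': 1024,
                 'MB': 1024 * 1024,
                 'GB': 1024 * 1024 * 1024,
                 'TB': 1024 * 1024 * 1024 * 1024}

    def parse_memsize(value):
        """Return a size in bytes from an int or a '<number><unit>' string.

        >>> parse_memsize('512kB')
        524288
        >>> parse_memsize(4096)
        4096
        """
        if isinstance(value, str) and len(value) > 2 and value[-2:] in MEM_UNITS:
            return int(value[:-2]) * MEM_UNITS[value[-2:]]
        return int(value)         # raises ValueError on garbage like '512xB'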