From 1b6c0f4735df94768c591ef441412cc137d75690 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 6 Apr 2010 16:35:39 +0200 Subject: [PATCH] Implement support for the CSV field_size_limit, new in python 2.5. --- .gitignore | 2 ++ debian/changelog | 3 ++- examples/pgloader.conf | 17 +++++++++-------- pgloader.1.txt | 7 +++++++ pgloader/csvreader.py | 15 ++++++++++++++- pgloader/reader.py | 19 +++++++++++++++++++ 6 files changed, 53 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 5c80b3c..40db39e 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ /reformat/pgtime.pyc /pgloader.1 /pgloader.1.html +/BUGS.html +/TODO.html diff --git a/debian/changelog b/debian/changelog index c6fda74..370d57f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -pgloader (2.3.3-1) unstable; urgency=low +pgloader (2.3.3~dev1-1) unstable; urgency=low * Implement -f --field-sep to overwrite the default from command line * Add support for filename arguments, which use defaults @@ -7,6 +7,7 @@ pgloader (2.3.3-1) unstable; urgency=low * Support setting any PG option (-o and config file) * Have --debug show a traceback * Fix a bug where pgloader would freeze on early error (no such file) + * Implement an option to set csv field size limit -- Dimitri Fontaine Sun, 4 Apr 2010 19:34:39 +0200 diff --git a/examples/pgloader.conf b/examples/pgloader.conf index edf865e..fe07651 100644 --- a/examples/pgloader.conf +++ b/examples/pgloader.conf @@ -121,12 +121,13 @@ fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17 reformat = c:pgtime:time [csv] -table = csv -format = csv -filename = csv/csv.data -field_sep = , -quotechar = " -columns = x, y, a, b, d:6, c:5 -only_cols = 3-6 -skip_head_lines = 1 +table = csv +format = csv +filename = csv/csv.data +field_size_limit = 512kB +field_sep = , +quotechar = " +columns = x, y, a, b, d:6, c:5 +only_cols = 3-6 +skip_head_lines = 1 diff --git a/pgloader.1.txt b/pgloader.1.txt index 9a1607c..bcca478 100644 --- 
a/pgloader.1.txt +++ b/pgloader.1.txt @@ -777,6 +777,13 @@ skipinitialspace:: When +True+, whitespace immediately following the +delimiter+ is ignored. The default is +False+. +field_size_limit:: + + Sets the maximum field size allowed by the python +CSV+ parser. Accepts + a number of bytes (integer), or a string containing a number then one + of those units (case sensitive): +kB+, +MB+, +GB+, +TB+. Requires at + least python 2.5. + == FIXED FORMAT CONFIGURATION PARAMETERS == fixed_specs:: diff --git a/pgloader/csvreader.py b/pgloader/csvreader.py index 42f692c..730fba0 100644 --- a/pgloader/csvreader.py +++ b/pgloader/csvreader.py @@ -44,8 +44,10 @@ class CSVReader(DataReader): if self.skipinitialspace is not False: self.skipinitialspace = self.skipinitialspace == 'True' + self._getopt('field_size_limit', config, name, template, -1, "mem") + for opt in ['doublequote', 'escapechar', - 'quotechar', 'skipinitialspace']: + 'quotechar', 'skipinitialspace', 'field_size_limit']: self.log.debug("reader.readconfig %s: '%s'" \ % (opt, self.__dict__[opt])) @@ -78,6 +80,17 @@ class CSVReader(DataReader): begin_linenb = None last_line_nb = 1 + # set the field_size_limit, from python 2.5 + if self.field_size_limit != -1: + try: + csv.field_size_limit(self.field_size_limit) + self.log.debug("csv.field_size_limit(%d)" \ + % self.field_size_limit) + except AttributeError: + #'module' object has no attribute 'field_size_limit' + self.log.warning("field_size_limit is new in python version 2.5") + pass + # now read the lines for columns in csv.reader(self.fd, dialect = 'pgloader'): # we count logical lines diff --git a/pgloader/reader.py b/pgloader/reader.py index 4b8bcaf..bcef947 100644 --- a/pgloader/reader.py +++ b/pgloader/reader.py @@ -32,6 +32,10 @@ class DataReader: self.table = table self.columns = columns self.reject = reject + self.mem_units = {'kB': 1024, + 'MB': 1024*1024, + 'GB': 1024*1024*1024, + 'TB': 1024*1024*1024*1024} if self.input_encoding is None: if 
INPUT_ENCODING is not None: @@ -117,6 +121,21 @@ class DataReader: % (section, option, self.__dict__[option])) raise PGLoader_Error, e + elif opt_type == 'mem' and self.__dict__[option] is not None: + try: + opt = self.__dict__[option] + if type(opt) == type("string") \ + and len(opt) > 2 and opt [-2:] in self.mem_units: + unit = opt[-2:] + size = int(opt[:-2]) * self.mem_units[unit] + self.__dict__[option] = int(size) + else: + self.__dict__[option] = int(self.__dict__[option]) + except ValueError, e: + self.log.error('Configuration option %s.%s is not a memsize: %s' \ + % (section, option, self.__dict__[option])) + raise PGLoader_Error, e + return self.__dict__[option] def readlines(self):