Implement support for the CSV field_size_limit, new in python 2.5.

Dimitri Fontaine 2010-04-06 16:35:39 +02:00
parent 411a9a53d7
commit 1b6c0f4735
6 changed files with 53 additions and 10 deletions
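
For context, a minimal sketch of the stdlib call this commit wraps, assuming python 2.5 or later; the 200000-byte field and the 512 kB limit are made-up values for illustration, not part of the commit:

    # the default limit is 131072 bytes; a longer field makes csv.reader
    # raise csv.Error ("field larger than field limit (131072)")
    import csv
    from StringIO import StringIO

    data = StringIO('key,"%s"\n' % ('x' * 200000))

    csv.field_size_limit(512 * 1024)        # returns the previous limit

    reader = csv.reader(data)
    print len(reader.next()[1])             # 200000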

.gitignore

@@ -15,3 +15,5 @@
 /reformat/pgtime.pyc
 /pgloader.1
 /pgloader.1.html
+/BUGS.html
+/TODO.html

debian/changelog

@@ -1,4 +1,4 @@
-pgloader (2.3.3-1) unstable; urgency=low
+pgloader (2.3.3~dev1-1) unstable; urgency=low
 
   * Implement -f --field-sep to overwrite the default from command line
   * Add support for filename arguments, which use defaults
@@ -7,6 +7,7 @@ pgloader (2.3.3-1) unstable; urgency=low
   * Support setting any PG option (-o and config file)
   * Have --debug show a traceback
   * Fix a bug where pgloader would freeze on early error (no such file)
+  * Implement an option to set csv field size limit
 
  -- Dimitri Fontaine <dim@tapoueh.org>  Sun, 4 Apr 2010 19:34:39 +0200

@@ -121,12 +121,13 @@ fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
 reformat = c:pgtime:time
 
 [csv]
-table           = csv
-format          = csv
-filename        = csv/csv.data
-field_sep       = ,
-quotechar       = "
-columns         = x, y, a, b, d:6, c:5
-only_cols       = 3-6
-skip_head_lines = 1
+table            = csv
+format           = csv
+filename         = csv/csv.data
+field_size_limit = 512kB
+field_sep        = ,
+quotechar        = "
+columns          = x, y, a, b, d:6, c:5
+only_cols        = 3-6
+skip_head_lines  = 1

@@ -777,6 +777,13 @@ skipinitialspace::
 
     When +True+, whitespace immediately following the +delimiter+ is
     ignored. The default is +False+.
 
+field_size_limit::
+
+    Sets the maximum field size allowed by the python +CSV+ parser. Accepts
+    a number of bytes (integer), or a string containing a number followed by
+    one of these units (case sensitive): +kB+, +MB+, +GB+, +TB+. Requires at
+    least python 2.5.
+
 == FIXED FORMAT CONFIGURATION PARAMETERS ==
 
 fixed_specs::

@@ -44,8 +44,10 @@ class CSVReader(DataReader):
         if self.skipinitialspace is not False:
             self.skipinitialspace = self.skipinitialspace == 'True'
 
+        self._getopt('field_size_limit', config, name, template, -1, "mem")
+
         for opt in ['doublequote', 'escapechar',
-                    'quotechar', 'skipinitialspace']:
+                    'quotechar', 'skipinitialspace', 'field_size_limit']:
             self.log.debug("reader.readconfig %s: '%s'" \
                            % (opt, self.__dict__[opt]))
@@ -78,6 +80,17 @@ class CSVReader(DataReader):
         begin_linenb = None
         last_line_nb = 1
 
+        # set the field_size_limit, from python 2.5
+        if self.field_size_limit != -1:
+            try:
+                csv.field_size_limit(self.field_size_limit)
+                self.log.debug("csv.field_size_limit(%d)" \
+                               % self.field_size_limit)
+            except AttributeError:
+                # 'module' object has no attribute 'field_size_limit'
+                self.log.warning("field_size_limit is new in python version 2.5")
+                pass
+
         # now read the lines
         for columns in csv.reader(self.fd, dialect = 'pgloader'):
             # we count logical lines
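
The guard above lets pgloader degrade gracefully on interpreters older than 2.5. A standalone sketch of the same pattern, using the stdlib logging module in place of pgloader's self.log (an assumption made for illustration):

    import csv
    import logging

    log = logging.getLogger('csvreader')

    def apply_field_size_limit(limit):
        # -1 is the configured default, meaning "leave the parser limit alone"
        if limit == -1:
            return
        try:
            csv.field_size_limit(limit)
            log.debug("csv.field_size_limit(%d)" % limit)
        except AttributeError:
            # python < 2.5: the csv module has no field_size_limit at all
            log.warning("field_size_limit is new in python version 2.5")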

@@ -32,6 +32,10 @@ class DataReader:
         self.table   = table
         self.columns = columns
         self.reject  = reject
 
+        self.mem_units = {'kB': 1024,
+                          'MB': 1024*1024,
+                          'GB': 1024*1024*1024,
+                          'TB': 1024*1024*1024*1024}
 
         if self.input_encoding is None:
             if INPUT_ENCODING is not None:
@@ -117,6 +121,21 @@ class DataReader:
                                % (section, option, self.__dict__[option]))
                 raise PGLoader_Error, e
 
+        elif opt_type == 'mem' and self.__dict__[option] is not None:
+            try:
+                opt = self.__dict__[option]
+                if type(opt) == type("string") \
+                       and len(opt) > 2 and opt[-2:] in self.mem_units:
+                    unit = opt[-2:]
+                    size = int(opt[:-2]) * self.mem_units[unit]
+                    self.__dict__[option] = int(size)
+                else:
+                    self.__dict__[option] = int(self.__dict__[option])
+            except ValueError, e:
+                self.log.error('Configuration option %s.%s is not a memsize: %s' \
+                               % (section, option, self.__dict__[option]))
+                raise PGLoader_Error, e
+
         return self.__dict__[option]
 
     def readlines(self):
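
The "mem" option type above is what turns a value like 512kB into bytes. A standalone sketch of that arithmetic; parse_memsize and the doctest values are made up for illustration and are not pgloader's API:

    MEM_UNITS = {'kB': 1024,
                 'MB': 1024 * 1024,
                 'GB': 1024 * 1024 * 1024,
                 'TB': 1024 * 1024 * 1024 * 1024}

    def parse_memsize(value):
        """Return a size in bytes from an int or a '<number><unit>' string.

        >>> parse_memsize('512kB')
        524288
        >>> parse_memsize(4096)
        4096
        """
        if isinstance(value, str) and len(value) > 2 and value[-2:] in MEM_UNITS:
            return int(value[:-2]) * MEM_UNITS[value[-2:]]
        return int(value)         # raises ValueError on garbage like '512xB'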