Support for fixed format datafile

2026-05-04 10:31:02 +02:00 · 2008-05-21 10:33:06 +00:00 · 2008-05-21 10:33:06 +00:00 · b7c7e6a7c1
commit b7c7e6a7c1
parent 2d503ad0fa
9 changed files with 182 additions and 20 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -165,8 +165,7 @@ instead of plain transactions.
 == Fixed Format ==

 Current status::
-   Designs easy and ok, needs to get done sometime, in a minor release
-   between 'big items'
+  CVS, intended for 2.3.1

 Support fixed format: no separator, known length (in bytes) per
 column.
--- a/debian/changelog
+++ b/debian/changelog
@ -1,3 +1,10 @@
+pgloader (2.3.1-1) unstable; urgency=low
+
+  * FIX: close database connection as soon as possible
+  * Add support for fixed format
+
+ -- Dimitri Fontaine <dim@tapoueh.org>  Wed, 21 May 2008 12:19:42 +0200
+
 pgloader (2.3.0-1) unstable; urgency=low

  * FIX the cluttered test case, see BUGS.txt
--- a/examples/README
+++ b/examples/README
@ -65,22 +65,39 @@ necessary tables:

 $ for sql in */*sql; do psql pgloader < $sql; done
 $ ../pgloader.py -Tsc pgloader.conf
- 
- [...]
-
- Table name        |    duration |    size |  copy rows |     errors
- ====================================================================
- clob              |      0.043s |   32 kB |          7 |          0
- cluttered         |      0.032s |   32 kB |          6 |          0
- csv               |      0.031s |   16 kB |          6 |          0
- errors            |      0.030s |   32 kB |          4 |          3
- partial           |      0.078s |   32 kB |          7 |          0
- reformat          |      0.018s |   24 kB |          4 |          1
- serial            |      0.024s |   32 kB |          7 |          0
- simple            |      0.024s |   32 kB |          7 |          0
- udc               |      0.018s |   32 kB |          5 |          0
- ====================================================================
- Total             |      0.298s |  264 kB |         53 |          4

+  errors       WARNING  COPY error, trying to find on which line
+  errors       WARNING  COPY data buffer saved in /tmp/errors.AhWvAv.pgloader
+  errors       WARNING  COPY error recovery done (2/3) in 0.064s
+  errors       WARNING  COPY error, trying to find on which line
+  errors       WARNING  COPY data buffer saved in /tmp/errors.BclHtj.pgloader
+  errors       WARNING  COPY error recovery done (1/1) in 0.054s
+  errors       ERROR    3 errors found into [errors] data
+  errors       ERROR    please read /tmp/errors.rej.log for errors log
+  errors       ERROR    and /tmp/errors.rej for data still to process
+  errors       ERROR    3 database errors occured
+  reformat     WARNING  COPY error, trying to find on which line
+  reformat     WARNING  COPY data buffer saved in /tmp/reformat.6P4WCD.pgloader
+  reformat     WARNING  COPY error recovery done (1/4) in 0.034s
+  reformat     ERROR    1 errors found into [reformat] data
+  reformat     ERROR    please read /tmp/reformat.rej.log for errors log
+  reformat     ERROR    and /tmp/reformat.rej for data still to process
+  reformat     ERROR    1 database errors occured
+  
+  Table name        |    duration |    size |  copy rows |     errors
+  ====================================================================
+  allcols           |      0.025s |       - |          8 |          0
+  clob              |      0.034s |       - |          7 |          0
+  cluttered         |      0.061s |       - |          6 |          0
+  csv               |      0.035s |       - |          6 |          0
+  errors            |      0.113s |       - |          4 |          3
+  fixed             |      0.045s |       - |          3 |          0
+  partial           |      0.030s |       - |          7 |          0
+  reformat          |      0.036s |       - |          4 |          1
+  serial            |      0.029s |       - |          7 |          0
+  simple            |      0.050s |       - |          7 |          0
+  udc               |      0.020s |       - |          5 |          0
+  ====================================================================
+  Total             |      0.367s |       - |         64 |          4

 Please note errors test should return 3 errors and reformat 1 error.
--- a/examples/pgloader.conf
+++ b/examples/pgloader.conf
@ -105,6 +105,14 @@ field_sep       = |
 columns         = id, timestamp
 reformat        = timestamp:mysql:timestamp

+[fixed]
+table           = fixed
+format          = fixed
+filename        = fixed/fixed.data
+columns         = *
+fixed_specs     = a:0:10, b:10:8, c:18:8, d:26:17
+reformat        = c:pgtime:time
+
 [csv]
 table        = csv
 format       = csv
--- a/pgloader.1.txt
+++ b/pgloader.1.txt
@ -354,7 +354,7 @@ table::

 format::
 +
-The format data are to be found, either text or csv.
+The format data are to be found, either +text+, +csv+ or +fixed+.
 +
 See next sections for format specific options.

@ -695,6 +695,16 @@ skipinitialspace::
    When +True+, whitespace immediately following the +delimiter+ is
    ignored. The default is +False+.

+== FIXED FORMAT CONFIGURATION PARAMETERS ==
+
+fixed_specs::
+
+This parameter specifies the start position and byte length of each
+column to load. Syntax is +column_name:start:len+, with entries
+separated by commas.
+
+    fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
+
 == CONFIGURATION EXAMPLE ==

 Please see the given configuration example which should be distributed in
--- a/pgloader.py
+++ b/pgloader.py
@ -452,7 +452,7 @@ def load_data():
    
    import pgloader.options
    if pgloader.options.REFORMAT_PATH:
-        rpath  = read_path(pgloader.options.REFORMAT_PATH, check = False)
+        rpath  = read_path(pgloader.options.REFORMAT_PATH, log, check = False)
        crpath = check_path(rpath, log)
    else:
        rpath  = crpath  = None
--- a/pgloader/fixedreader.py
+++ b/pgloader/fixedreader.py
@ -0,0 +1,89 @@
+# Author: Dimitri Fontaine <dim@tapoueh.org>
+#
+# pgloader fixed format reader
+#
+# handles configuration, parse data, then pass them to database module for
+# COPY preparation
+
+import os, sys, os.path, time
+
+from tools    import PGLoader_Error, Reject, parse_config_string
+from db       import db
+from reader   import DataReader, UnbufferedFileReader
+
+from options import DRY_RUN, PEDANTIC
+from options import TRUNCATE, VACUUM
+from options import COUNT, FROM_COUNT, FROM_ID
+from options import INPUT_ENCODING, PG_CLIENT_ENCODING
+from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
+from options import NEWLINE_ESCAPES
+
+class FixedReader(DataReader):
+    """
+    Read fixed file format, configuration gives for each field
+     - field name
+     - start position
+     - length
+    """
+
+    def readconfig(self, config, name, template):
+        """ get this reader module configuration from config file """
+        DataReader.readconfig(self, config, name, template)
+
+        # this will be called twice if templates are in used, so we
+        # have to protect ourselves against removing already read
+        # configurations while in second run.
+
+        self._getopt('fixed_specs', config, name, template, None)
+
+        if self.fixed_specs:
+            self.positions = {}
+            # parse the fixed specs
+            specs = [x.strip().split(':') for x in self.fixed_specs.strip().split(',')]
+            try:
+                for name, start, length in specs:
+                    self.positions[name] = (int(start), int(length))
+                    
+            except ValueError, e:
+                self.log.error("%s.fixed_specs, " + \
+                               "start and length must be numbers", name)
+                
+                raise PGLoader_Error, \
+                      "Please fix %s.fixed_specs configuration" % name
+        else:
+            msg = "section %s: fixed format type require 'fixed_specs'" % name
+            raise PGLoader_Error, msg
+
+        self.log.debug('reader.readconfig: positions %s', self.positions)
+
+    def readlines(self):
+        """ read data from configured file, and generate (yields) for
+        each data line: line, columns and rowid """
+
+        self.fd = UnbufferedFileReader(self.filename, self.log,
+                                       encoding = self.input_encoding,
+                                       start    = self.start,
+                                       end      = self.end)
+
+        line_nb = 0
+
+        for line in self.fd:
+            line_nb += 1
+            line     = line.strip("\n")
+            llen     = len(line)
+            columns  = []
+
+            for cname, cpos in self.columns:
+                start, length = self.positions[cname]
+
+                if llen < (start+length):
+                    self.log.error("Line %d is too short " % line_nb +
+                                   "(column %s requires len >= %d)" \
+                                   % (cname, start+length))
+
+                    msg = "Please review fixed_specs configuration"
+                    raise PGLoader_Error, msg
+                
+                columns.append(line[start:start+length])
+
+            yield line, columns
--- a/pgloader/pgloader.py
+++ b/pgloader/pgloader.py
@ -596,6 +596,17 @@ class PGLoader(threading.Thread):
                                         self.input_encoding,
                                         self.table, self.columns,
                                         self.newline_escapes)
+                
+            elif self.format.lower() == 'fixed':
+                from fixedreader import FixedReader
+                self.reader = FixedReader(self.log, self.db, self.reject,
+                                          self.filename,
+                                          self.input_encoding,
+                                          self.table, self.columns)
+                
+            else:
+                self.log.error("unknown format '%s'")
+                raise PGLoader_Error, "Skipping section %s" % self.name

            self.log.debug('reader.readconfig()')
            self.reader.readconfig(config, name, self.tsection)
--- a/reformat/pgtime.py
+++ b/reformat/pgtime.py
@ -0,0 +1,21 @@
+# Author: Dimitri Fontaine <dim@tapoueh.org>
+#
+# pgloader time-related reformatting module
+#
+
+def time(reject, input):
+    """ Reformat str as a PostgreSQL time
+
+    Input time like: 08231560
+    We want instead this input: 08:23:15.60
+    """
+    if len(input) != 8:
+        e = "time reformat input too short: %s" % input
+        reject.log(e, input)
+    
+    hour       = input[0:2]
+    minute     = input[2:4]
+    seconds    = input[4:6]
+    hundredths = input[6:8]
+    
+    return '%s:%s:%s.%s' % (hour, minute, seconds, hundredths)