From b7c7e6a7c19e235d0abf0a4d16c5469e52c651a5 Mon Sep 17 00:00:00 2001 From: dim Date: Wed, 21 May 2008 10:33:06 +0000 Subject: [PATCH] Support for fixed format datafile --- TODO.txt | 3 +- debian/changelog | 7 ++++ examples/README | 49 +++++++++++++++-------- examples/pgloader.conf | 8 ++++ pgloader.1.txt | 12 +++++- pgloader.py | 2 +- pgloader/fixedreader.py | 89 +++++++++++++++++++++++++++++++++++++++++ pgloader/pgloader.py | 11 +++++ reformat/pgtime.py | 21 ++++++++++ 9 files changed, 182 insertions(+), 20 deletions(-) create mode 100644 pgloader/fixedreader.py create mode 100644 reformat/pgtime.py diff --git a/TODO.txt b/TODO.txt index 192dee1..451a318 100644 --- a/TODO.txt +++ b/TODO.txt @@ -165,8 +165,7 @@ instead of plain transactions. == Fixed Format == Current status:: - Designs easy and ok, needs to get done sometime, in a minor release - between 'big items' + CVS, intended for 2.3.1 Support fixed format: no separator, known length (in bytes) per column. diff --git a/debian/changelog b/debian/changelog index 6b3660c..80dcea1 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +pgloader (2.3.1-1) unstable; urgency=low + + * FIX: close database connection as soon as possible + * Add support for fixed format + + -- Dimitri Fontaine Wed, 21 May 2008 12:19:42 +0200 + pgloader (2.3.0-1) unstable; urgency=low * FIX the cluttered test case, see BUGS.txt diff --git a/examples/README b/examples/README index 0010d73..00104ac 100644 --- a/examples/README +++ b/examples/README @@ -65,22 +65,39 @@ necessary tables: $ for sql in */*sql; do psql pgloader < $sql; done $ ../pgloader.py -Tsc pgloader.conf - - [...] 
- - Table name | duration | size | copy rows | errors - ==================================================================== - clob | 0.043s | 32 kB | 7 | 0 - cluttered | 0.032s | 32 kB | 6 | 0 - csv | 0.031s | 16 kB | 6 | 0 - errors | 0.030s | 32 kB | 4 | 3 - partial | 0.078s | 32 kB | 7 | 0 - reformat | 0.018s | 24 kB | 4 | 1 - serial | 0.024s | 32 kB | 7 | 0 - simple | 0.024s | 32 kB | 7 | 0 - udc | 0.018s | 32 kB | 5 | 0 - ==================================================================== - Total | 0.298s | 264 kB | 53 | 4 + errors WARNING COPY error, trying to find on which line + errors WARNING COPY data buffer saved in /tmp/errors.AhWvAv.pgloader + errors WARNING COPY error recovery done (2/3) in 0.064s + errors WARNING COPY error, trying to find on which line + errors WARNING COPY data buffer saved in /tmp/errors.BclHtj.pgloader + errors WARNING COPY error recovery done (1/1) in 0.054s + errors ERROR 3 errors found into [errors] data + errors ERROR please read /tmp/errors.rej.log for errors log + errors ERROR and /tmp/errors.rej for data still to process + errors ERROR 3 database errors occured + reformat WARNING COPY error, trying to find on which line + reformat WARNING COPY data buffer saved in /tmp/reformat.6P4WCD.pgloader + reformat WARNING COPY error recovery done (1/4) in 0.034s + reformat ERROR 1 errors found into [reformat] data + reformat ERROR please read /tmp/reformat.rej.log for errors log + reformat ERROR and /tmp/reformat.rej for data still to process + reformat ERROR 1 database errors occured + + Table name | duration | size | copy rows | errors + ==================================================================== + allcols | 0.025s | - | 8 | 0 + clob | 0.034s | - | 7 | 0 + cluttered | 0.061s | - | 6 | 0 + csv | 0.035s | - | 6 | 0 + errors | 0.113s | - | 4 | 3 + fixed | 0.045s | - | 3 | 0 + partial | 0.030s | - | 7 | 0 + reformat | 0.036s | - | 4 | 1 + serial | 0.029s | - | 7 | 0 + simple | 0.050s | - | 7 | 0 + udc | 0.020s | - | 5 | 0 + 
==================================================================== + Total | 0.367s | - | 64 | 4 Please note errors test should return 3 errors and reformat 1 error. diff --git a/examples/pgloader.conf b/examples/pgloader.conf index 4a5a828..4e31b5f 100644 --- a/examples/pgloader.conf +++ b/examples/pgloader.conf @@ -105,6 +105,14 @@ field_sep = | columns = id, timestamp reformat = timestamp:mysql:timestamp +[fixed] +table = fixed +format = fixed +filename = fixed/fixed.data +columns = * +fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17 +reformat = c:pgtime:time + [csv] table = csv format = csv diff --git a/pgloader.1.txt b/pgloader.1.txt index 0c79fc6..878ffc9 100644 --- a/pgloader.1.txt +++ b/pgloader.1.txt @@ -354,7 +354,7 @@ table:: format:: + -The format data are to be found, either text or csv. +The format data are to be found, either +text+, +csv+ or +fixed+. + See next sections for format specific options. @@ -695,6 +695,16 @@ skipinitialspace:: When +True+, whitespace immediately following the +delimiter+ is ignored. The default is +False+. +== FIXED FORMAT CONFIGURATION PARAMETERS == + +fixed_specs:: ++ +This parameter specifies the start position and byte length for +each column to load. Syntax is +column_name:start:len+, separated by +commas. 
++
+  fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
+
 == CONFIGURATION EXAMPLE ==
 
 Please see the given configuration example which should be distributed in
diff --git a/pgloader.py b/pgloader.py
index ee94496..7d70e2e 100644
--- a/pgloader.py
+++ b/pgloader.py
@@ -452,7 +452,7 @@ def load_data():
     import pgloader.options
 
     if pgloader.options.REFORMAT_PATH:
-        rpath = read_path(pgloader.options.REFORMAT_PATH, check = False)
+        rpath = read_path(pgloader.options.REFORMAT_PATH, log, check = False)
         crpath = check_path(rpath, log)
     else:
         rpath = crpath = None
diff --git a/pgloader/fixedreader.py b/pgloader/fixedreader.py
new file mode 100644
index 0000000..06d9fa8
--- /dev/null
+++ b/pgloader/fixedreader.py
@@ -0,0 +1,108 @@
+# Author: Dimitri Fontaine
+#
+# pgloader fixed format reader
+#
+# handles configuration, parse data, then pass them to database module for
+# COPY preparation
+
+import os, sys, os.path, time
+
+from tools import PGLoader_Error, Reject, parse_config_string
+from db import db
+from reader import DataReader, UnbufferedFileReader
+
+from options import DRY_RUN, PEDANTIC
+from options import TRUNCATE, VACUUM
+from options import COUNT, FROM_COUNT, FROM_ID
+from options import INPUT_ENCODING, PG_CLIENT_ENCODING
+from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
+from options import NEWLINE_ESCAPES
+
+class FixedReader(DataReader):
+    """
+    Read fixed file format, configuration gives for each field
+     - field name
+     - start position
+     - length
+    """
+
+    def readconfig(self, config, name, template):
+        """ get this reader module configuration from config file
+
+        Reads the 'fixed_specs' option of section name and fills
+        self.positions, a dict giving for each column name its
+        (start, length) pair as integers.
+
+        Raises PGLoader_Error when 'fixed_specs' is not set or when
+        one of its entries cannot be parsed.
+        """
+        DataReader.readconfig(self, config, name, template)
+
+        # this will be called twice if templates are in use, so we
+        # have to protect ourselves against removing already read
+        # configurations while in second run.
+
+        self._getopt('fixed_specs', config, name, template, None)
+
+        if self.fixed_specs:
+            self.positions = {}
+            # parse the fixed specs
+            specs = [x.strip().split(':') for x in self.fixed_specs.strip().split(',')]
+            try:
+                # the loop must not rebind 'name': it holds the section
+                # name that the error messages below report
+                for cname, start, length in specs:
+                    self.positions[cname] = (int(start), int(length))
+
+            except ValueError:
+                # raised by int() as well as by an entry which does not
+                # split into exactly three fields
+                self.log.error("%s.fixed_specs, " + \
+                               "start and length must be numbers", name)
+
+                raise PGLoader_Error, \
+                      "Please fix %s.fixed_specs configuration" % name
+        else:
+            msg = "section %s: fixed format type require 'fixed_specs'" % name
+            raise PGLoader_Error, msg
+
+        self.log.debug('reader.readconfig: positions %s', self.positions)
+
+    def readlines(self):
+        """ read data from configured file, and generate (yields) for
+        each data line: line and columns
+
+        columns holds one slice of the line per entry of self.columns,
+        cut at the position configured for that column name.
+
+        Raises PGLoader_Error on the first line which is too short
+        for one of the columns.
+        """
+
+        self.fd = UnbufferedFileReader(self.filename, self.log,
+                                       encoding = self.input_encoding,
+                                       start = self.start,
+                                       end = self.end)
+
+        line_nb = 0
+
+        for line in self.fd:
+            line_nb += 1
+            line = line.strip("\n")
+            llen = len(line)
+            columns = []
+
+            for cname, cpos in self.columns:
+                start, length = self.positions[cname]
+
+                if llen < (start+length):
+                    self.log.error("Line %d is too short " % line_nb +
+                                   "(column %s requires len >= %d)" \
+                                   % (cname, start+length))
+
+                    msg = "Please review fixed_specs configuration"
+                    raise PGLoader_Error, msg
+
+                columns.append(line[start:start+length])
+
+            yield line, columns
diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py
index 3ce9a50..6c8adc6 100644
--- a/pgloader/pgloader.py
+++ b/pgloader/pgloader.py
@@ -596,6 +596,17 @@ class PGLoader(threading.Thread):
                                      self.input_encoding,
                                      self.table, self.columns,
                                      self.newline_escapes)
+
+        elif self.format.lower() == 'fixed':
+            from fixedreader import FixedReader
+            self.reader = FixedReader(self.log, self.db, self.reject,
+                                      self.filename,
+                                      self.input_encoding,
+                                      self.table, self.columns)
+
+        else:
+            self.log.error("unknown format '%s'", self.format)
+            raise PGLoader_Error, "Skipping section %s" % self.name
 
         self.log.debug('reader.readconfig()')
         self.reader.readconfig(config, name, self.tsection)
diff --git a/reformat/pgtime.py b/reformat/pgtime.py
new file mode 100644
index 0000000..cab79f5
--- /dev/null
+++ b/reformat/pgtime.py
@@ -0,0 +1,24 @@
+# Author: Dimitri Fontaine
+#
+# pgloader time-related reformatting module
+#
+
+def time(reject, input):
+    """ Reformat str as a PostgreSQL time
+
+    Input time like: 08231560
+    We want instead this input: 08:23:15.60
+
+    Input whose length is not 8 is logged to reject, then still
+    reformatted as is.
+    """
+    if len(input) != 8:
+        e = "time reformat input must be 8 characters long: %s" % input
+        reject.log(e, input)
+
+    hour = input[0:2]
+    minute = input[2:4]
+    seconds = input[4:6]
+    hundredths = input[6:8]
+
+    return '%s:%s:%s.%s' % (hour, minute, seconds, hundredths)