Support for fixed format datafile

This commit is contained in:
dim 2008-05-21 10:33:06 +00:00
parent 2d503ad0fa
commit b7c7e6a7c1
9 changed files with 182 additions and 20 deletions

View File

@ -165,8 +165,7 @@ instead of plain transactions.
== Fixed Format ==
Current status::
Designs easy and ok, needs to get done sometime, in a minor release
between 'big items'
CVS, intended for 2.3.1
Support fixed format: no separator, known length (in bytes) per
column.

7
debian/changelog vendored
View File

@ -1,3 +1,10 @@
pgloader (2.3.1-1) unstable; urgency=low
* FIX: close database connection as soon as possible
* Add support for fixed format
-- Dimitri Fontaine <dim@tapoueh.org> Wed, 21 May 2008 12:19:42 +0200
pgloader (2.3.0-1) unstable; urgency=low
* FIX the cluttered test case, see BUGS.txt

View File

@ -65,22 +65,39 @@ necessary tables:
$ for sql in */*sql; do psql pgloader < $sql; done
$ ../pgloader.py -Tsc pgloader.conf
[...]
Table name | duration | size | copy rows | errors
====================================================================
clob | 0.043s | 32 kB | 7 | 0
cluttered | 0.032s | 32 kB | 6 | 0
csv | 0.031s | 16 kB | 6 | 0
errors | 0.030s | 32 kB | 4 | 3
partial | 0.078s | 32 kB | 7 | 0
reformat | 0.018s | 24 kB | 4 | 1
serial | 0.024s | 32 kB | 7 | 0
simple | 0.024s | 32 kB | 7 | 0
udc | 0.018s | 32 kB | 5 | 0
====================================================================
Total | 0.298s | 264 kB | 53 | 4
errors WARNING COPY error, trying to find on which line
errors WARNING COPY data buffer saved in /tmp/errors.AhWvAv.pgloader
errors WARNING COPY error recovery done (2/3) in 0.064s
errors WARNING COPY error, trying to find on which line
errors WARNING COPY data buffer saved in /tmp/errors.BclHtj.pgloader
errors WARNING COPY error recovery done (1/1) in 0.054s
errors ERROR 3 errors found into [errors] data
errors ERROR please read /tmp/errors.rej.log for errors log
errors ERROR and /tmp/errors.rej for data still to process
errors ERROR 3 database errors occured
reformat WARNING COPY error, trying to find on which line
reformat WARNING COPY data buffer saved in /tmp/reformat.6P4WCD.pgloader
reformat WARNING COPY error recovery done (1/4) in 0.034s
reformat ERROR 1 errors found into [reformat] data
reformat ERROR please read /tmp/reformat.rej.log for errors log
reformat ERROR and /tmp/reformat.rej for data still to process
reformat ERROR 1 database errors occured
Table name | duration | size | copy rows | errors
====================================================================
allcols | 0.025s | - | 8 | 0
clob | 0.034s | - | 7 | 0
cluttered | 0.061s | - | 6 | 0
csv | 0.035s | - | 6 | 0
errors | 0.113s | - | 4 | 3
fixed | 0.045s | - | 3 | 0
partial | 0.030s | - | 7 | 0
reformat | 0.036s | - | 4 | 1
serial | 0.029s | - | 7 | 0
simple | 0.050s | - | 7 | 0
udc | 0.020s | - | 5 | 0
====================================================================
Total | 0.367s | - | 64 | 4
Please note errors test should return 3 errors and reformat 1 error.

View File

@ -105,6 +105,14 @@ field_sep = |
columns = id, timestamp
reformat = timestamp:mysql:timestamp
[fixed]
table = fixed
format = fixed
filename = fixed/fixed.data
columns = *
fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
reformat = c:pgtime:time
[csv]
table = csv
format = csv

View File

@ -354,7 +354,7 @@ table::
format::
+
The format data are to be found, either text or csv.
The format data are to be found, either +text+, +csv+ or +fixed+.
+
See next sections for format specific options.
@ -695,6 +695,16 @@ skipinitialspace::
When +True+, whitespace immediately following the +delimiter+ is
ignored. The default is +False+.
== FIXED FORMAT CONFIGURATION PARAMETERS ==
fixed_specs::
+
This parameter specifies the zero-based start position and the byte
length of each column to load. Syntax is +column_name:start:len+, with
entries separated by commas.
+
fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
== CONFIGURATION EXAMPLE ==
Please see the given configuration example which should be distributed in

View File

@ -452,7 +452,7 @@ def load_data():
import pgloader.options
if pgloader.options.REFORMAT_PATH:
rpath = read_path(pgloader.options.REFORMAT_PATH, check = False)
rpath = read_path(pgloader.options.REFORMAT_PATH, log, check = False)
crpath = check_path(rpath, log)
else:
rpath = crpath = None

89
pgloader/fixedreader.py Normal file
View File

@ -0,0 +1,89 @@
# Author: Dimitri Fontaine <dim@tapoueh.org>
#
# pgloader fixed format reader
#
# handles configuration, parse data, then pass them to database module for
# COPY preparation
import os, sys, os.path, time
from tools import PGLoader_Error, Reject, parse_config_string
from db import db
from reader import DataReader, UnbufferedFileReader
from options import DRY_RUN, PEDANTIC
from options import TRUNCATE, VACUUM
from options import COUNT, FROM_COUNT, FROM_ID
from options import INPUT_ENCODING, PG_CLIENT_ENCODING
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
from options import NEWLINE_ESCAPES
class FixedReader(DataReader):
    """
    Read fixed file format, configuration gives for each field
     - field name
     - start position
     - length
    """

    def readconfig(self, config, name, template):
        """ get this reader module configuration from config file

        On top of what DataReader.readconfig() handles, this reads the
        'fixed_specs' option: a comma separated list of
        column_name:start:length entries. The parsed result is stored in
        self.positions as {column_name: (start, length)}.

        config, name and template are handed over unchanged to
        DataReader.readconfig() and self._getopt(); name is the section
        name used in error messages.

        Raises PGLoader_Error when fixed_specs is not set, or when an
        entry cannot be parsed as column_name:start:length with integer
        start and length.
        """
        DataReader.readconfig(self, config, name, template)

        # this will be called twice if templates are in use, so we
        # have to protect ourselves against removing already read
        # configurations while in second run.
        # NOTE(review): _getopt presumably sets self.fixed_specs -- confirm
        self._getopt('fixed_specs', config, name, template, None)

        if not self.fixed_specs:
            msg = "section %s: fixed format type require 'fixed_specs'" % name
            raise PGLoader_Error(msg)

        self.positions = {}
        try:
            for spec in self.fixed_specs.strip().split(','):
                # The column name is bound to cname, not name: the section
                # name has to stay intact for the error messages below.
                cname, start, length = spec.strip().split(':')
                self.positions[cname] = (int(start), int(length))
        except ValueError:
            # raised by int() on a non numeric value, and by the unpacking
            # when an entry does not have exactly three ':' separated parts
            self.log.error("%s.fixed_specs, "
                           "start and length must be numbers", name)
            raise PGLoader_Error(
                "Please fix %s.fixed_specs configuration" % name)

        self.log.debug('reader.readconfig: positions %s', self.positions)

    def readlines(self):
        """ read data from configured file, and generate (yields) for
        each data line: line and columns

        line is the input line without its newline characters, columns
        is the list of substrings cut out of it, one per entry of
        self.columns, using the (start, length) found in self.positions.

        Raises PGLoader_Error when a line is too short to contain one of
        the configured columns.
        """
        self.fd = UnbufferedFileReader(self.filename, self.log,
                                       encoding=self.input_encoding,
                                       start=self.start,
                                       end=self.end)

        line_nb = 0
        for line in self.fd:
            line_nb += 1
            line = line.strip("\n")
            llen = len(line)

            columns = []
            # only the column name is used here, cpos is ignored
            for cname, cpos in self.columns:
                start, length = self.positions[cname]
                stop = start + length

                if llen < stop:
                    self.log.error("Line %d is too short "
                                   "(column %s requires len >= %d)",
                                   line_nb, cname, stop)
                    raise PGLoader_Error(
                        "Please review fixed_specs configuration")

                columns.append(line[start:stop])

            yield line, columns

View File

@ -596,6 +596,17 @@ class PGLoader(threading.Thread):
self.input_encoding,
self.table, self.columns,
self.newline_escapes)
elif self.format.lower() == 'fixed':
from fixedreader import FixedReader
self.reader = FixedReader(self.log, self.db, self.reject,
self.filename,
self.input_encoding,
self.table, self.columns)
else:
self.log.error("unknown format '%s'")
raise PGLoader_Error, "Skipping section %s" % self.name
self.log.debug('reader.readconfig()')
self.reader.readconfig(config, name, self.tsection)

21
reformat/pgtime.py Normal file
View File

@ -0,0 +1,21 @@
# Author: Dimitri Fontaine <dim@tapoueh.org>
#
# pgloader time-related reformatting module
#
def time(reject, input):
    """ Reformat str as a PostgreSQL time

    Input time like:            08231560
    We want instead this input: 08:23:15.60

    reject -- object whose log(message, data) method is called to report
              an input of unexpected length
    input  -- the string to reformat, expected to be 8 characters long

    Returns the input cut in four 2-character parts, formatted as
    hour:minute:seconds.hundredths. An input which is not 8 characters
    long is reported through reject.log() and still reformatted as is,
    so the returned string is then not a valid time.
    """
    if len(input) != 8:
        # this check catches too long inputs as well as too short ones,
        # so the message must not pretend the input is always too short
        e = "time reformat input must be 8 characters long: %s" % input
        reject.log(e, input)

    hour, minute, seconds, hundredths = [input[i:i + 2] for i in (0, 2, 4, 6)]

    return '%s:%s:%s.%s' % (hour, minute, seconds, hundredths)