mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-04 10:31:02 +02:00
Support for fixed format datafile
This commit is contained in:
parent
2d503ad0fa
commit
b7c7e6a7c1
3
TODO.txt
3
TODO.txt
@ -165,8 +165,7 @@ instead of plain transactions.
|
||||
== Fixed Format ==
|
||||
|
||||
Current status::
|
||||
Designs easy and ok, needs to get done sometime, in a minor release
|
||||
between 'big items'
|
||||
CVS, intended for 2.3.1
|
||||
|
||||
Support fixed format: no separator, known length (in bytes) per
|
||||
column.
|
||||
|
||||
7
debian/changelog
vendored
7
debian/changelog
vendored
@ -1,3 +1,10 @@
|
||||
pgloader (2.3.1-1) unstable; urgency=low
|
||||
|
||||
* FIX: close database connection as soon as possible
|
||||
* Add support for fixed format
|
||||
|
||||
-- Dimitri Fontaine <dim@tapoueh.org> Wed, 21 May 2008 12:19:42 +0200
|
||||
|
||||
pgloader (2.3.0-1) unstable; urgency=low
|
||||
|
||||
* FIX the cluttered test case, see BUGS.txt
|
||||
|
||||
@ -65,22 +65,39 @@ necessary tables:
|
||||
|
||||
$ for sql in */*sql; do psql pgloader < $sql; done
|
||||
$ ../pgloader.py -Tsc pgloader.conf
|
||||
|
||||
[...]
|
||||
|
||||
Table name | duration | size | copy rows | errors
|
||||
====================================================================
|
||||
clob | 0.043s | 32 kB | 7 | 0
|
||||
cluttered | 0.032s | 32 kB | 6 | 0
|
||||
csv | 0.031s | 16 kB | 6 | 0
|
||||
errors | 0.030s | 32 kB | 4 | 3
|
||||
partial | 0.078s | 32 kB | 7 | 0
|
||||
reformat | 0.018s | 24 kB | 4 | 1
|
||||
serial | 0.024s | 32 kB | 7 | 0
|
||||
simple | 0.024s | 32 kB | 7 | 0
|
||||
udc | 0.018s | 32 kB | 5 | 0
|
||||
====================================================================
|
||||
Total | 0.298s | 264 kB | 53 | 4
|
||||
|
||||
errors WARNING COPY error, trying to find on which line
|
||||
errors WARNING COPY data buffer saved in /tmp/errors.AhWvAv.pgloader
|
||||
errors WARNING COPY error recovery done (2/3) in 0.064s
|
||||
errors WARNING COPY error, trying to find on which line
|
||||
errors WARNING COPY data buffer saved in /tmp/errors.BclHtj.pgloader
|
||||
errors WARNING COPY error recovery done (1/1) in 0.054s
|
||||
errors ERROR 3 errors found into [errors] data
|
||||
errors ERROR please read /tmp/errors.rej.log for errors log
|
||||
errors ERROR and /tmp/errors.rej for data still to process
|
||||
errors ERROR 3 database errors occured
|
||||
reformat WARNING COPY error, trying to find on which line
|
||||
reformat WARNING COPY data buffer saved in /tmp/reformat.6P4WCD.pgloader
|
||||
reformat WARNING COPY error recovery done (1/4) in 0.034s
|
||||
reformat ERROR 1 errors found into [reformat] data
|
||||
reformat ERROR please read /tmp/reformat.rej.log for errors log
|
||||
reformat ERROR and /tmp/reformat.rej for data still to process
|
||||
reformat ERROR 1 database errors occured
|
||||
|
||||
Table name | duration | size | copy rows | errors
|
||||
====================================================================
|
||||
allcols | 0.025s | - | 8 | 0
|
||||
clob | 0.034s | - | 7 | 0
|
||||
cluttered | 0.061s | - | 6 | 0
|
||||
csv | 0.035s | - | 6 | 0
|
||||
errors | 0.113s | - | 4 | 3
|
||||
fixed | 0.045s | - | 3 | 0
|
||||
partial | 0.030s | - | 7 | 0
|
||||
reformat | 0.036s | - | 4 | 1
|
||||
serial | 0.029s | - | 7 | 0
|
||||
simple | 0.050s | - | 7 | 0
|
||||
udc | 0.020s | - | 5 | 0
|
||||
====================================================================
|
||||
Total | 0.367s | - | 64 | 4
|
||||
|
||||
Please note errors test should return 3 errors and reformat 1 error.
|
||||
|
||||
@ -105,6 +105,14 @@ field_sep = |
|
||||
columns = id, timestamp
|
||||
reformat = timestamp:mysql:timestamp
|
||||
|
||||
[fixed]
|
||||
table = fixed
|
||||
format = fixed
|
||||
filename = fixed/fixed.data
|
||||
columns = *
|
||||
fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
|
||||
reformat = c:pgtime:time
|
||||
|
||||
[csv]
|
||||
table = csv
|
||||
format = csv
|
||||
|
||||
@ -354,7 +354,7 @@ table::
|
||||
|
||||
format::
|
||||
+
|
||||
The format data are to be found, either text or csv.
|
||||
The format in which data are to be found, either +text+, +csv+ or +fixed+.
|
||||
+
|
||||
See next sections for format specific options.
|
||||
|
||||
@ -695,6 +695,16 @@ skipinitialspace::
|
||||
When +True+, whitespace immediately following the +delimiter+ is
|
||||
ignored. The default is +False+.
|
||||
|
||||
== FIXED FORMAT CONFIGURATION PARAMETERS ==
|
||||
|
||||
fixed_specs::
|
||||
+
|
||||
This parameter specifies the start position and byte length for
|
||||
each column to load. Syntax is +column_name:start:len+, separated by
|
||||
commas.
|
||||
+
|
||||
fixed_specs = a:0:10, b:10:8, c:18:8, d:26:17
|
||||
|
||||
== CONFIGURATION EXAMPLE ==
|
||||
|
||||
Please see the given configuration example which should be distributed in
|
||||
|
||||
@ -452,7 +452,7 @@ def load_data():
|
||||
|
||||
import pgloader.options
|
||||
if pgloader.options.REFORMAT_PATH:
|
||||
rpath = read_path(pgloader.options.REFORMAT_PATH, check = False)
|
||||
rpath = read_path(pgloader.options.REFORMAT_PATH, log, check = False)
|
||||
crpath = check_path(rpath, log)
|
||||
else:
|
||||
rpath = crpath = None
|
||||
|
||||
89
pgloader/fixedreader.py
Normal file
89
pgloader/fixedreader.py
Normal file
@ -0,0 +1,89 @@
|
||||
# Author: Dimitri Fontaine <dim@tapoueh.org>
|
||||
#
|
||||
# pgloader fixed format reader
|
||||
#
|
||||
# handles configuration, parse data, then pass them to database module for
|
||||
# COPY preparation
|
||||
|
||||
import os, sys, os.path, time
|
||||
|
||||
from tools import PGLoader_Error, Reject, parse_config_string
|
||||
from db import db
|
||||
from reader import DataReader, UnbufferedFileReader
|
||||
|
||||
from options import DRY_RUN, PEDANTIC
|
||||
from options import TRUNCATE, VACUUM
|
||||
from options import COUNT, FROM_COUNT, FROM_ID
|
||||
from options import INPUT_ENCODING, PG_CLIENT_ENCODING
|
||||
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
|
||||
from options import NEWLINE_ESCAPES
|
||||
|
||||
class FixedReader(DataReader):
    """
    Read fixed file format, configuration gives for each field
     - field name
     - start position
     - length
    """

    def readconfig(self, config, name, template):
        """ get this reader module configuration from config file

        Reads the 'fixed_specs' option and parses it into self.positions,
        a dict mapping each column name to its (start, length) pair.

        Raises PGLoader_Error when 'fixed_specs' is missing or when one
        of its entries cannot be parsed.
        """
        DataReader.readconfig(self, config, name, template)

        # this will be called twice if templates are in use, so we
        # have to protect ourselves against removing already read
        # configurations while in second run.

        self._getopt('fixed_specs', config, name, template, None)

        if not self.fixed_specs:
            msg = "section %s: fixed format type require 'fixed_specs'" % name
            raise PGLoader_Error(msg)

        self.positions = {}

        # parse the fixed specs: "a:0:10, b:10:8" gives
        # [['a', '0', '10'], ['b', '10', '8']]
        specs = [spec.strip().split(':')
                 for spec in self.fixed_specs.strip().split(',')]

        try:
            # do not reuse 'name' as the loop variable here: it holds the
            # section name, which the error messages below have to report
            for colname, start, length in specs:
                self.positions[colname] = (int(start), int(length))

        except ValueError:
            # raised by int() on non numeric values, and by the tuple
            # unpacking when an entry does not have exactly three fields
            self.log.error("%s.fixed_specs, " +
                           "start and length must be numbers", name)

            raise PGLoader_Error(
                "Please fix %s.fixed_specs configuration" % name)

        self.log.debug('reader.readconfig: positions %s', self.positions)

    def readlines(self):
        """ read data from configured file, and generate (yields) for
        each data line: line and columns

        columns is the list of substrings cut out of the line, one per
        entry of self.columns, using the (start, length) pair found in
        self.positions for that column name.

        Raises PGLoader_Error as soon as a line is too short to hold one
        of the configured columns.
        """

        self.fd = UnbufferedFileReader(self.filename, self.log,
                                       encoding = self.input_encoding,
                                       start    = self.start,
                                       end      = self.end)

        line_nb = 0

        for line in self.fd:
            line_nb += 1
            line     = line.strip("\n")
            llen     = len(line)
            columns  = []

            for cname, cpos in self.columns:
                start, length = self.positions[cname]
                stop          = start + length

                if llen < stop:
                    self.log.error("Line %d is too short " % line_nb +
                                   "(column %s requires len >= %d)"
                                   % (cname, stop))

                    msg = "Please review fixed_specs configuration"
                    raise PGLoader_Error(msg)

                columns.append(line[start:stop])

            yield line, columns
|
||||
@ -596,6 +596,17 @@ class PGLoader(threading.Thread):
|
||||
self.input_encoding,
|
||||
self.table, self.columns,
|
||||
self.newline_escapes)
|
||||
|
||||
elif self.format.lower() == 'fixed':
|
||||
from fixedreader import FixedReader
|
||||
self.reader = FixedReader(self.log, self.db, self.reject,
|
||||
self.filename,
|
||||
self.input_encoding,
|
||||
self.table, self.columns)
|
||||
|
||||
else:
|
||||
self.log.error("unknown format '%s'")
|
||||
raise PGLoader_Error, "Skipping section %s" % self.name
|
||||
|
||||
self.log.debug('reader.readconfig()')
|
||||
self.reader.readconfig(config, name, self.tsection)
|
||||
|
||||
21
reformat/pgtime.py
Normal file
21
reformat/pgtime.py
Normal file
@ -0,0 +1,21 @@
|
||||
# Author: Dimitri Fontaine <dim@tapoueh.org>
|
||||
#
|
||||
# pgloader time-related reformating module
|
||||
#
|
||||
|
||||
def time(reject, input):
    """ Reformat str as a PostgreSQL time

    Input time like:            08231560
    We want instead this input: 08:23:15.60

    An input which is not exactly 8 characters long is reported through
    reject.log(); it is still sliced and formatted afterwards.
    """
    if len(input) != 8:
        # the input can be too long as well as too short, say so
        e = "time reformat input has wrong length (expected 8): %s" % input
        reject.log(e, input)

    hour       = input[0:2]
    minute     = input[2:4]
    seconds    = input[4:6]
    hundredths = input[6:8]

    return '%s:%s:%s.%s' % (hour, minute, seconds, hundredths)
|
||||
Loading…
x
Reference in New Issue
Block a user