Implement header parsing for fixed file formats.

Fixed file formats might contain an header line with column names and a hint
of the size of each column. While it might be a long shot that we can
acutally use that as a proper fixed-format specification, this patch
implements a guess mode that also outputs the parsed header.

In case when the parsing is wrong in some level of details, it might
actually be a good start to copy/paste from the command output and go from
there.

Fixes #958.
This commit is contained in:
Dimitri Fontaine 2019-05-07 22:54:57 +02:00
parent 27b1a83b9f
commit b1d55e07d6
6 changed files with 155 additions and 11 deletions

View File

@ -130,8 +130,12 @@
;; (:file "csv-database")
(:file "csv")))
(:file "fixed"
:depends-on ("common" "csv"))
(:module "fixed"
:depends-on ("common")
:serial t
:components
((:file "fixed-guess")
(:file "fixed")))
(:file "copy"
:depends-on ("common" "csv"))

View File

@ -7,6 +7,9 @@
(in-package #:pgloader.parser)
(defrule option-fixed-header (and kw-fixed kw-header)
(:constant (cons :header t)))
(defrule hex-number (and "0x" (+ (hexdigit-char-p character)))
(:lambda (hex)
(bind (((_ digits) hex))
@ -55,7 +58,8 @@
option-drop-indexes
option-disable-triggers
option-identifiers-case
option-skip-header))
option-skip-header
option-fixed-header))
(defrule fixed-options (and kw-with
(and fixed-option (* (and comma fixed-option))))
@ -94,7 +98,7 @@
(alexandria:alist-plist clauses-list)))
(defrule load-fixed-cols-file-command (and fixed-source (? file-encoding)
fixed-source-field-list
(? fixed-source-field-list)
target
(? csv-target-table)
(? csv-target-column-list)
@ -144,7 +148,8 @@
:encoding ,encoding
:fields ',fields
:columns ',columns
:skip-lines ,(or (getf options :skip-line) 0))))
:skip-lines ,(or (getf options :skip-lines) 0)
:header ,(getf options :header))))
(copy-database source
,@ (when worker-count

View File

@ -0,0 +1,74 @@
;;;
;;; Given a list of columns in PostgreSQL, try to guess the fixed format
;;; specification from a sample of the file.
;;;
(in-package :pgloader.source.fixed)
(defgeneric get-first-lines (filename-or-stream &optional n)
(:documentation "Get the first line of given FILENAME-OR-STREAM.")
(:method ((stream stream) &optional (n 1))
(let ((pos (file-position stream)))
(file-position stream 0)
(prog1
(loop :repeat n
:for line := (read-line stream nil nil)
:while line
:collect line)
(file-position stream pos))))
(:method ((filename string) &optional (n 1))
(with-open-file (stream filename
:direction :input
:external-format :utf-8
:if-does-not-exist nil)
(loop :repeat n
:for line := (read-line stream nil nil)
:while line
:collect line))))
(defun guess-fixed-specs-from-header (header)
"Try to guess fixed specs from whitespace in the first line of the file."
(let* ((size (length header))
current-field-name (current-field-start 0)
specs)
(loop :for pos :from 0
:for previous-char := #\Space :then current-char
:for current-char :across header
:do (cond ((or (= (+ 1 pos) size) ; last char
(and (< 0 pos) ; new field
(char= #\Space previous-char)
(char/= #\Space current-char)))
(when (= (+ 1 pos) size)
(push current-char current-field-name))
(push (list (map 'string #'identity
(reverse current-field-name))
:start current-field-start
:length (- pos current-field-start)
:null-as :blanks
:trim-right t)
specs)
(setf current-field-name (list current-char))
(setf current-field-start pos))
((char/= #\Space current-char)
(push current-char current-field-name))))
(reverse specs)))
(defun guess-fixed-specs (filename-or-stream &optional (sample-size 1000))
"Use the first line as an header to guess the specification of the fixed
file from, and then match that against a sample of data from the file to
see if that matches what data we have there."
(let* ((sample (get-first-lines filename-or-stream sample-size))
(header (first sample))
(data (rest sample))
(fields (guess-fixed-specs-from-header header))
(specs (mapcar #'cdr fields)))
(loop :for line :in data
:collect (handler-case
(parse-row specs line)
(condition (e)
(log-message :error
"Fixed: failed to use header as ~
specification for columns: ~a" e)
(return-from guess-fixed-specs nil))))
fields))

View File

@ -10,12 +10,7 @@
"Assign the type slot to sqlite."
(setf (slot-value fixed 'type) "fixed"))
(defclass copy-fixed (md-copy)
((encoding :accessor encoding ; file encoding
:initarg :encoding) ;
(skip-lines :accessor skip-lines ; CSV headers
:initarg :skip-lines ;
:initform 0))
(defclass copy-fixed (md-copy) ()
(:documentation "pgloader Fixed Columns Data Source"))
(defmethod clone-copy-for ((fixed copy-fixed) path-spec)
@ -30,6 +25,29 @@
;; return the new instance!
fixed-clone))
(defmethod parse-header ((fixed copy-fixed))
"Parse the header line given a FIXED setup."
(with-connection (cnx (source fixed)
:direction :input
:external-format (encoding fixed)
:if-does-not-exist nil)
(let ((input (md-strm cnx)))
(loop :repeat (skip-lines fixed) :do (read-line input nil nil))
(let* ((field-spec-list (guess-fixed-specs input))
(specifications
(loop :for specs :in field-spec-list
:collect (destructuring-bind (name &key start length
&allow-other-keys)
specs
(format nil
"~a from ~d for ~d ~a"
name start length
"[null if blanks, trim right whitespace]")))))
(setf (fields fixed) field-spec-list)
(log-message :log
"Parsed ~d columns specs from header:~%(~%~{ ~a~^,~%~}~%)"
(length (fields fixed)) specifications)))))
(declaim (inline parse-row))
(defun parse-row (fixed-cols-specs line)

7
test/fixed-guess.dat Normal file
View File

@ -0,0 +1,7 @@
QECDPO QECSEM QELSEM QERSEM QENCOM QESCRE QEACRE QEMCRE QEJCRE QEHCRE QECUCR QESMAJ QEAMAJ QEMMAJ QEJMAJ QEHMAJ QECUMJ QETOPD
ACT INV Inventaire par niveau Par niveau 0 20 7 10 29 180457 HLJFD 20 7 10 29 180457 HLJFD 0
ACT IPI Inventaire pickings / colonnes Picking / Colo. 0 20 8 4 28 164330 HLJFD 20 8 4 28 164330 HLJFD 0
ACT STB SEQUENCE STANDARD BACKUP STANDARD 0 20 12 11 13 154308 HLJFL 20 12 11 13 154308 HLJFL 0
ACT STC SEQUENCE backup 250913 STANDARD 0 20 13 9 25 161133 HLJFL 20 13 9 25 161133 HLJFL 0
ACT STD SEQUENCE STANDARD STANDARD 0 20 12 11 13 154813 HLJFL 20 12 11 13 154813 HLJFL 0
ACT TTT test test 0 20 12 11 13 102211 HLJFL 20 12 11 13 102211 HLJFL 0

36
test/fixed-guess.load Normal file
View File

@ -0,0 +1,36 @@
LOAD FIXED
FROM fixed-guess.dat
INTO postgresql:///pgloader
TARGET TABLE fixed.guess
WITH fixed header
SET work_mem to '14MB',
standard_conforming_strings to 'on'
before load do
$$ create schema if not exists fixed; $$,
$$ drop table if exists fixed.guess; $$,
$$
create table fixed.guess
(
QECDPO character varying(3),
QECSEM character varying(3),
QELSEM character varying(30),
QERSEM character varying(15),
QENCOM integer,
QESCRE smallint,
QEACRE smallint,
QEMCRE smallint,
QEJCRE smallint,
QEHCRE integer,
QECUCR character varying(10),
QESMAJ smallint,
QEAMAJ smallint,
QEMMAJ smallint,
QEJMAJ smallint,
QEHMAJ integer,
QECUMJ character varying(10),
QETOPD character varying(1)
);
$$ ;