mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-04 18:36:12 +02:00
Implement header parsing for fixed file formats.
Fixed file formats might contain an header line with column names and a hint of the size of each column. While it might be a long shot that we can acutally use that as a proper fixed-format specification, this patch implements a guess mode that also outputs the parsed header. In case when the parsing is wrong in some level of details, it might actually be a good start to copy/paste from the command output and go from there. Fixes #958.
This commit is contained in:
parent
27b1a83b9f
commit
b1d55e07d6
@ -130,8 +130,12 @@
|
||||
;; (:file "csv-database")
|
||||
(:file "csv")))
|
||||
|
||||
(:file "fixed"
|
||||
:depends-on ("common" "csv"))
|
||||
(:module "fixed"
|
||||
:depends-on ("common")
|
||||
:serial t
|
||||
:components
|
||||
((:file "fixed-guess")
|
||||
(:file "fixed")))
|
||||
|
||||
(:file "copy"
|
||||
:depends-on ("common" "csv"))
|
||||
|
||||
@ -7,6 +7,9 @@
|
||||
|
||||
(in-package #:pgloader.parser)
|
||||
|
||||
(defrule option-fixed-header (and kw-fixed kw-header)
|
||||
(:constant (cons :header t)))
|
||||
|
||||
(defrule hex-number (and "0x" (+ (hexdigit-char-p character)))
|
||||
(:lambda (hex)
|
||||
(bind (((_ digits) hex))
|
||||
@ -55,7 +58,8 @@
|
||||
option-drop-indexes
|
||||
option-disable-triggers
|
||||
option-identifiers-case
|
||||
option-skip-header))
|
||||
option-skip-header
|
||||
option-fixed-header))
|
||||
|
||||
(defrule fixed-options (and kw-with
|
||||
(and fixed-option (* (and comma fixed-option))))
|
||||
@ -94,7 +98,7 @@
|
||||
(alexandria:alist-plist clauses-list)))
|
||||
|
||||
(defrule load-fixed-cols-file-command (and fixed-source (? file-encoding)
|
||||
fixed-source-field-list
|
||||
(? fixed-source-field-list)
|
||||
target
|
||||
(? csv-target-table)
|
||||
(? csv-target-column-list)
|
||||
@ -144,7 +148,8 @@
|
||||
:encoding ,encoding
|
||||
:fields ',fields
|
||||
:columns ',columns
|
||||
:skip-lines ,(or (getf options :skip-line) 0))))
|
||||
:skip-lines ,(or (getf options :skip-lines) 0)
|
||||
:header ,(getf options :header))))
|
||||
|
||||
(copy-database source
|
||||
,@ (when worker-count
|
||||
|
||||
74
src/sources/fixed/fixed-guess.lisp
Normal file
74
src/sources/fixed/fixed-guess.lisp
Normal file
@ -0,0 +1,74 @@
|
||||
;;;
|
||||
;;; Given a list of columns in PostgreSQL, try to guess the fixed format
|
||||
;;; specification from a sample of the file.
|
||||
;;;
|
||||
|
||||
(in-package :pgloader.source.fixed)
|
||||
|
||||
(defgeneric get-first-lines (filename-or-stream &optional n)
|
||||
(:documentation "Get the first line of given FILENAME-OR-STREAM.")
|
||||
(:method ((stream stream) &optional (n 1))
|
||||
(let ((pos (file-position stream)))
|
||||
(file-position stream 0)
|
||||
(prog1
|
||||
(loop :repeat n
|
||||
:for line := (read-line stream nil nil)
|
||||
:while line
|
||||
:collect line)
|
||||
(file-position stream pos))))
|
||||
(:method ((filename string) &optional (n 1))
|
||||
(with-open-file (stream filename
|
||||
:direction :input
|
||||
:external-format :utf-8
|
||||
:if-does-not-exist nil)
|
||||
(loop :repeat n
|
||||
:for line := (read-line stream nil nil)
|
||||
:while line
|
||||
:collect line))))
|
||||
|
||||
(defun guess-fixed-specs-from-header (header)
|
||||
"Try to guess fixed specs from whitespace in the first line of the file."
|
||||
(let* ((size (length header))
|
||||
current-field-name (current-field-start 0)
|
||||
specs)
|
||||
(loop :for pos :from 0
|
||||
:for previous-char := #\Space :then current-char
|
||||
:for current-char :across header
|
||||
:do (cond ((or (= (+ 1 pos) size) ; last char
|
||||
(and (< 0 pos) ; new field
|
||||
(char= #\Space previous-char)
|
||||
(char/= #\Space current-char)))
|
||||
(when (= (+ 1 pos) size)
|
||||
(push current-char current-field-name))
|
||||
(push (list (map 'string #'identity
|
||||
(reverse current-field-name))
|
||||
:start current-field-start
|
||||
:length (- pos current-field-start)
|
||||
:null-as :blanks
|
||||
:trim-right t)
|
||||
specs)
|
||||
(setf current-field-name (list current-char))
|
||||
(setf current-field-start pos))
|
||||
|
||||
((char/= #\Space current-char)
|
||||
(push current-char current-field-name))))
|
||||
(reverse specs)))
|
||||
|
||||
(defun guess-fixed-specs (filename-or-stream &optional (sample-size 1000))
|
||||
"Use the first line as an header to guess the specification of the fixed
|
||||
file from, and then match that against a sample of data from the file to
|
||||
see if that matches what data we have there."
|
||||
(let* ((sample (get-first-lines filename-or-stream sample-size))
|
||||
(header (first sample))
|
||||
(data (rest sample))
|
||||
(fields (guess-fixed-specs-from-header header))
|
||||
(specs (mapcar #'cdr fields)))
|
||||
(loop :for line :in data
|
||||
:collect (handler-case
|
||||
(parse-row specs line)
|
||||
(condition (e)
|
||||
(log-message :error
|
||||
"Fixed: failed to use header as ~
|
||||
specification for columns: ~a" e)
|
||||
(return-from guess-fixed-specs nil))))
|
||||
fields))
|
||||
@ -10,12 +10,7 @@
|
||||
"Assign the type slot to sqlite."
|
||||
(setf (slot-value fixed 'type) "fixed"))
|
||||
|
||||
(defclass copy-fixed (md-copy)
|
||||
((encoding :accessor encoding ; file encoding
|
||||
:initarg :encoding) ;
|
||||
(skip-lines :accessor skip-lines ; CSV headers
|
||||
:initarg :skip-lines ;
|
||||
:initform 0))
|
||||
(defclass copy-fixed (md-copy) ()
|
||||
(:documentation "pgloader Fixed Columns Data Source"))
|
||||
|
||||
(defmethod clone-copy-for ((fixed copy-fixed) path-spec)
|
||||
@ -30,6 +25,29 @@
|
||||
;; return the new instance!
|
||||
fixed-clone))
|
||||
|
||||
(defmethod parse-header ((fixed copy-fixed))
|
||||
"Parse the header line given a FIXED setup."
|
||||
(with-connection (cnx (source fixed)
|
||||
:direction :input
|
||||
:external-format (encoding fixed)
|
||||
:if-does-not-exist nil)
|
||||
(let ((input (md-strm cnx)))
|
||||
(loop :repeat (skip-lines fixed) :do (read-line input nil nil))
|
||||
(let* ((field-spec-list (guess-fixed-specs input))
|
||||
(specifications
|
||||
(loop :for specs :in field-spec-list
|
||||
:collect (destructuring-bind (name &key start length
|
||||
&allow-other-keys)
|
||||
specs
|
||||
(format nil
|
||||
"~a from ~d for ~d ~a"
|
||||
name start length
|
||||
"[null if blanks, trim right whitespace]")))))
|
||||
(setf (fields fixed) field-spec-list)
|
||||
(log-message :log
|
||||
"Parsed ~d columns specs from header:~%(~%~{ ~a~^,~%~}~%)"
|
||||
(length (fields fixed)) specifications)))))
|
||||
|
||||
(declaim (inline parse-row))
|
||||
|
||||
(defun parse-row (fixed-cols-specs line)
|
||||
7
test/fixed-guess.dat
Normal file
7
test/fixed-guess.dat
Normal file
@ -0,0 +1,7 @@
|
||||
QECDPO QECSEM QELSEM QERSEM QENCOM QESCRE QEACRE QEMCRE QEJCRE QEHCRE QECUCR QESMAJ QEAMAJ QEMMAJ QEJMAJ QEHMAJ QECUMJ QETOPD
|
||||
ACT INV Inventaire par niveau Par niveau 0 20 7 10 29 180457 HLJFD 20 7 10 29 180457 HLJFD 0
|
||||
ACT IPI Inventaire pickings / colonnes Picking / Colo. 0 20 8 4 28 164330 HLJFD 20 8 4 28 164330 HLJFD 0
|
||||
ACT STB SEQUENCE STANDARD BACKUP STANDARD 0 20 12 11 13 154308 HLJFL 20 12 11 13 154308 HLJFL 0
|
||||
ACT STC SEQUENCE backup 250913 STANDARD 0 20 13 9 25 161133 HLJFL 20 13 9 25 161133 HLJFL 0
|
||||
ACT STD SEQUENCE STANDARD STANDARD 0 20 12 11 13 154813 HLJFL 20 12 11 13 154813 HLJFL 0
|
||||
ACT TTT test test 0 20 12 11 13 102211 HLJFL 20 12 11 13 102211 HLJFL 0
|
||||
36
test/fixed-guess.load
Normal file
36
test/fixed-guess.load
Normal file
@ -0,0 +1,36 @@
|
||||
LOAD FIXED
|
||||
FROM fixed-guess.dat
|
||||
INTO postgresql:///pgloader
|
||||
TARGET TABLE fixed.guess
|
||||
|
||||
WITH fixed header
|
||||
|
||||
SET work_mem to '14MB',
|
||||
standard_conforming_strings to 'on'
|
||||
|
||||
before load do
|
||||
$$ create schema if not exists fixed; $$,
|
||||
$$ drop table if exists fixed.guess; $$,
|
||||
$$
|
||||
create table fixed.guess
|
||||
(
|
||||
QECDPO character varying(3),
|
||||
QECSEM character varying(3),
|
||||
QELSEM character varying(30),
|
||||
QERSEM character varying(15),
|
||||
QENCOM integer,
|
||||
QESCRE smallint,
|
||||
QEACRE smallint,
|
||||
QEMCRE smallint,
|
||||
QEJCRE smallint,
|
||||
QEHCRE integer,
|
||||
QECUCR character varying(10),
|
||||
QESMAJ smallint,
|
||||
QEAMAJ smallint,
|
||||
QEMMAJ smallint,
|
||||
QEJMAJ smallint,
|
||||
QEHMAJ integer,
|
||||
QECUMJ character varying(10),
|
||||
QETOPD character varying(1)
|
||||
);
|
||||
$$ ;
|
||||
Loading…
x
Reference in New Issue
Block a user