diff --git a/pgloader.asd b/pgloader.asd index 9b1034c..aeb14b7 100644 --- a/pgloader.asd +++ b/pgloader.asd @@ -130,8 +130,12 @@ ;; (:file "csv-database") (:file "csv"))) - (:file "fixed" - :depends-on ("common" "csv")) + (:module "fixed" + :depends-on ("common") + :serial t + :components + ((:file "fixed-guess") + (:file "fixed"))) (:file "copy" :depends-on ("common" "csv")) diff --git a/src/parsers/command-fixed.lisp b/src/parsers/command-fixed.lisp index 9fe2d78..e0522c1 100644 --- a/src/parsers/command-fixed.lisp +++ b/src/parsers/command-fixed.lisp @@ -7,6 +7,9 @@ (in-package #:pgloader.parser) +(defrule option-fixed-header (and kw-fixed kw-header) + (:constant (cons :header t))) + (defrule hex-number (and "0x" (+ (hexdigit-char-p character))) (:lambda (hex) (bind (((_ digits) hex)) @@ -55,7 +58,8 @@ option-drop-indexes option-disable-triggers option-identifiers-case - option-skip-header)) + option-skip-header + option-fixed-header)) (defrule fixed-options (and kw-with (and fixed-option (* (and comma fixed-option)))) @@ -94,7 +98,7 @@ (alexandria:alist-plist clauses-list))) (defrule load-fixed-cols-file-command (and fixed-source (? file-encoding) - fixed-source-field-list + (? fixed-source-field-list) target (? csv-target-table) (? csv-target-column-list) @@ -144,7 +148,8 @@ :encoding ,encoding :fields ',fields :columns ',columns - :skip-lines ,(or (getf options :skip-line) 0)))) + :skip-lines ,(or (getf options :skip-lines) 0) + :header ,(getf options :header)))) (copy-database source ,@ (when worker-count diff --git a/src/sources/fixed/fixed-guess.lisp b/src/sources/fixed/fixed-guess.lisp new file mode 100644 index 0000000..7b16a24 --- /dev/null +++ b/src/sources/fixed/fixed-guess.lisp @@ -0,0 +1,74 @@ +;;; +;;; Given a list of columns in PostgreSQL, try to guess the fixed format +;;; specification from a sample of the file. +;;; + +(in-package :pgloader.source.fixed) + +(defgeneric get-first-lines (filename-or-stream &optional n) + (:documentation "Get the first line of given FILENAME-OR-STREAM.") + (:method ((stream stream) &optional (n 1)) + (let ((pos (file-position stream))) + (file-position stream 0) + (prog1 + (loop :repeat n + :for line := (read-line stream nil nil) + :while line + :collect line) + (file-position stream pos)))) + (:method ((filename string) &optional (n 1)) + (with-open-file (stream filename + :direction :input + :external-format :utf-8 + :if-does-not-exist nil) + (loop :repeat n + :for line := (read-line stream nil nil) + :while line + :collect line)))) + +(defun guess-fixed-specs-from-header (header) + "Try to guess fixed specs from whitespace in the first line of the file." + (let* ((size (length header)) + current-field-name (current-field-start 0) + specs) + (loop :for pos :from 0 + :for previous-char := #\Space :then current-char + :for current-char :across header + :do (cond ((or (= (+ 1 pos) size) ; last char + (and (< 0 pos) ; new field + (char= #\Space previous-char) + (char/= #\Space current-char))) + (when (= (+ 1 pos) size) + (push current-char current-field-name)) + (push (list (map 'string #'identity + (reverse current-field-name)) + :start current-field-start + :length (- pos current-field-start) + :null-as :blanks + :trim-right t) + specs) + (setf current-field-name (list current-char)) + (setf current-field-start pos)) + + ((char/= #\Space current-char) + (push current-char current-field-name)))) + (reverse specs))) + +(defun guess-fixed-specs (filename-or-stream &optional (sample-size 1000)) + "Use the first line as an header to guess the specification of the fixed + file from, and then match that against a sample of data from the file to + see if that matches what data we have there." + (let* ((sample (get-first-lines filename-or-stream sample-size)) + (header (first sample)) + (data (rest sample)) + (fields (guess-fixed-specs-from-header header)) + (specs (mapcar #'cdr fields))) + (loop :for line :in data + :collect (handler-case + (parse-row specs line) + (condition (e) + (log-message :error + "Fixed: failed to use header as ~ +specification for columns: ~a" e) + (return-from guess-fixed-specs nil)))) + fields)) diff --git a/src/sources/fixed.lisp b/src/sources/fixed/fixed.lisp similarity index 63% rename from src/sources/fixed.lisp rename to src/sources/fixed/fixed.lisp index 8d36a8f..03f36d5 100644 --- a/src/sources/fixed.lisp +++ b/src/sources/fixed/fixed.lisp @@ -10,12 +10,7 @@ "Assign the type slot to sqlite." (setf (slot-value fixed 'type) "fixed")) -(defclass copy-fixed (md-copy) - ((encoding :accessor encoding ; file encoding - :initarg :encoding) ; - (skip-lines :accessor skip-lines ; CSV headers - :initarg :skip-lines ; - :initform 0)) +(defclass copy-fixed (md-copy) () (:documentation "pgloader Fixed Columns Data Source")) (defmethod clone-copy-for ((fixed copy-fixed) path-spec) @@ -30,6 +25,29 @@ ;; return the new instance! fixed-clone)) +(defmethod parse-header ((fixed copy-fixed)) + "Parse the header line given a FIXED setup." + (with-connection (cnx (source fixed) + :direction :input + :external-format (encoding fixed) + :if-does-not-exist nil) + (let ((input (md-strm cnx))) + (loop :repeat (skip-lines fixed) :do (read-line input nil nil)) + (let* ((field-spec-list (guess-fixed-specs input)) + (specifications + (loop :for specs :in field-spec-list + :collect (destructuring-bind (name &key start length + &allow-other-keys) + specs + (format nil + "~a from ~d for ~d ~a" + name start length + "[null if blanks, trim right whitespace]"))))) + (setf (fields fixed) field-spec-list) + (log-message :log + "Parsed ~d columns specs from header:~%(~%~{ ~a~^,~%~}~%)" + (length (fields fixed)) specifications))))) + (declaim (inline parse-row)) (defun parse-row (fixed-cols-specs line) diff --git a/test/fixed-guess.dat b/test/fixed-guess.dat new file mode 100644 index 0000000..a9e5536 --- /dev/null +++ b/test/fixed-guess.dat @@ -0,0 +1,7 @@ +QECDPO QECSEM QELSEM QERSEM QENCOM QESCRE QEACRE QEMCRE QEJCRE QEHCRE QECUCR QESMAJ QEAMAJ QEMMAJ QEJMAJ QEHMAJ QECUMJ QETOPD +ACT INV Inventaire par niveau Par niveau 0 20 7 10 29 180457 HLJFD 20 7 10 29 180457 HLJFD 0 +ACT IPI Inventaire pickings / colonnes Picking / Colo. 0 20 8 4 28 164330 HLJFD 20 8 4 28 164330 HLJFD 0 +ACT STB SEQUENCE STANDARD BACKUP STANDARD 0 20 12 11 13 154308 HLJFL 20 12 11 13 154308 HLJFL 0 +ACT STC SEQUENCE backup 250913 STANDARD 0 20 13 9 25 161133 HLJFL 20 13 9 25 161133 HLJFL 0 +ACT STD SEQUENCE STANDARD STANDARD 0 20 12 11 13 154813 HLJFL 20 12 11 13 154813 HLJFL 0 +ACT TTT test test 0 20 12 11 13 102211 HLJFL 20 12 11 13 102211 HLJFL 0 \ No newline at end of file diff --git a/test/fixed-guess.load b/test/fixed-guess.load new file mode 100644 index 0000000..1968638 --- /dev/null +++ b/test/fixed-guess.load @@ -0,0 +1,36 @@ +LOAD FIXED + FROM fixed-guess.dat + INTO postgresql:///pgloader + TARGET TABLE fixed.guess + + WITH fixed header + + SET work_mem to '14MB', + standard_conforming_strings to 'on' + + before load do + $$ create schema if not exists fixed; $$, + $$ drop table if exists fixed.guess; $$, + $$ + create table fixed.guess + ( + QECDPO character varying(3), + QECSEM character varying(3), + QELSEM character varying(30), + QERSEM character varying(15), + QENCOM integer, + QESCRE smallint, + QEACRE smallint, + QEMCRE smallint, + QEJCRE smallint, + QEHCRE integer, + QECUCR character varying(10), + QESMAJ smallint, + QEAMAJ smallint, + QEMMAJ smallint, + QEJMAJ smallint, + QEHMAJ integer, + QECUMJ character varying(10), + QETOPD character varying(1) + ); + $$ ;