From 7cf7e714fcb4f03ff3169e64e4045fb2dd5872f6 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 2 Oct 2014 01:00:03 +0200 Subject: [PATCH] Implement the source date format option. --- pgloader.1 | 102 +++++++++++++++++- pgloader.1.md | 40 +++++++- pgloader.asd | 7 +- src/package.lisp | 9 +- src/parsers/date-format.lisp | 125 +++++++++++++++++++++++ src/sources.lisp | 29 ++++-- test/Makefile | 1 + test/csv-parse-date.load | 24 +++++ test/regress/expected/csv-parse-date.out | 0 9 files changed, 319 insertions(+), 18 deletions(-) create mode 100644 src/parsers/date-format.lisp create mode 100644 test/csv-parse-date.load create mode 100644 test/regress/expected/csv-parse-date.out diff --git a/pgloader.1 b/pgloader.1 index d2fc025..6157c3a 100644 --- a/pgloader.1 +++ b/pgloader.1 @@ -534,7 +534,56 @@ The processing of this option is not currently implemented\. When the field is expected of the date type, then this option allows to specify the date format used in the file\. . .IP -The processing of this option is not currently implemented\. +Date format string are template strings modeled against the PostgreSQL \fBto_char\fR template strings support, limited to the following patterns: +. +.IP "\(bu" 4 +YYYY, YYY, YY for the year part +. +.IP "\(bu" 4 +MM for the numeric month part +. +.IP "\(bu" 4 +DD for the numeric day part +. +.IP "\(bu" 4 +HH, HH12, HH24 for the hour part +. +.IP "\(bu" 4 +am, AM, a\.m\., A\.M\. +. +.IP "\(bu" 4 +pm, PM, p\.m\., P\.M\. +. +.IP "\(bu" 4 +MI for the minutes part +. +.IP "\(bu" 4 +SS for the seconds part +. +.IP "\(bu" 4 +MS for the milliseconds part (4 digits) +. +.IP "\(bu" 4 +US for the microseconds part (6 digits) +. +.IP "\(bu" 4 +unparsed punctuation signs: \- \. * # @ T / \e and space +. +.IP "" 0 +. +.IP +Here\'s an example of a \fIdate format\fR specification: +. +.IP "" 4 +. +.nf + +column\-name [date format \'YYYY\-MM\-DD HH24\-MI\-SS\.US\'] +. +.fi +. +.IP "" 0 + . .IP "\(bu" 4 \fInull if\fR @@ -744,7 +793,56 @@ The processing of this option is not currently implemented\. When the field is expected of the date type, then this option allows to specify the date format used in the file\. . .IP -The processing of this option is not currently implemented\. +Date format string are template strings modeled against the PostgreSQL \fBto_char\fR template strings support, limited to the following patterns: +. +.IP "\(bu" 4 +YYYY, YYY, YY for the year part +. +.IP "\(bu" 4 +MM for the numeric month part +. +.IP "\(bu" 4 +DD for the numeric day part +. +.IP "\(bu" 4 +HH, HH12, HH24 for the hour part +. +.IP "\(bu" 4 +am, AM, a\.m\., A\.M\. +. +.IP "\(bu" 4 +pm, PM, p\.m\., P\.M\. +. +.IP "\(bu" 4 +MI for the minutes part +. +.IP "\(bu" 4 +SS for the seconds part +. +.IP "\(bu" 4 +MS for the milliseconds part (4 digits) +. +.IP "\(bu" 4 +US for the microseconds part (6 digits) +. +.IP "\(bu" 4 +unparsed punctuation signs: \- \. * # @ T / \e and space +. +.IP "" 0 +. +.IP +Here\'s an example of a \fIdate format\fR specification: +. +.IP "" 4 +. +.nf + +column\-name [date format \'YYYY\-MM\-DD HH24\-MI\-SS\.US\'] +. +.fi +. +.IP "" 0 + . .IP "\(bu" 4 \fInull if\fR diff --git a/pgloader.1.md b/pgloader.1.md index b6b75e0..16558bd 100644 --- a/pgloader.1.md +++ b/pgloader.1.md @@ -479,7 +479,25 @@ The `csv` format command accepts the following clauses and options: When the field is expected of the date type, then this option allows to specify the date format used in the file. - The processing of this option is not currently implemented. + Date format string are template strings modeled against the + PostgreSQL `to_char` template strings support, limited to the + following patterns: + + - YYYY, YYY, YY for the year part + - MM for the numeric month part + - DD for the numeric day part + - HH, HH12, HH24 for the hour part + - am, AM, a.m., A.M. + - pm, PM, p.m., P.M. + - MI for the minutes part + - SS for the seconds part + - MS for the milliseconds part (4 digits) + - US for the microseconds part (6 digits) + - unparsed punctuation signs: - . * # @ T / \ and space + + Here's an example of a *date format* specification: + + column-name [date format 'YYYY-MM-DD HH24-MI-SS.US'] - *null if* @@ -664,7 +682,25 @@ The `fixed` format command accepts the following clauses and options: When the field is expected of the date type, then this option allows to specify the date format used in the file. - The processing of this option is not currently implemented. + Date format string are template strings modeled against the + PostgreSQL `to_char` template strings support, limited to the + following patterns: + + - YYYY, YYY, YY for the year part + - MM for the numeric month part + - DD for the numeric day part + - HH, HH12, HH24 for the hour part + - am, AM, a.m., A.M. + - pm, PM, p.m., P.M. + - MI for the minutes part + - SS for the seconds part + - MS for the milliseconds part (4 digits) + - US for the microseconds part (6 digits) + - unparsed punctuation signs: - . * # @ T / \ and space + + Here's an example of a *date format* specification: + + column-name [date format 'YYYY-MM-DD HH24-MI-SS.US'] - *null if* diff --git a/pgloader.asd b/pgloader.asd index e58be78..73ae63c 100644 --- a/pgloader.asd +++ b/pgloader.asd @@ -42,13 +42,13 @@ :depends-on ("package" "params") :components ((:file "charsets") - (:file "archive") (:file "threads") (:file "logs") (:file "monitor" :depends-on ("logs")) (:file "state") (:file "report" :depends-on ("state")) (:file "utils" :depends-on ("charsets" "monitor")) + (:file "archive" :depends-on ("logs")) ;; those are one-package-per-file (:file "transforms") @@ -70,12 +70,13 @@ :depends-on ("params" "package" "utils" "pgsql") :components ((:file "parse-ini") - (:file "parser"))) + (:file "parser") + (:file "date-format"))) ;; generic API for Sources (:file "sources-api" :pathname "sources" - :depends-on ("params" "package" "utils")) + :depends-on ("params" "package" "utils" "parsers")) ;; Source format specific implementations (:module sources diff --git a/src/package.lisp b/src/package.lisp index 2b56e7a..89dabe2 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -54,7 +54,6 @@ #:report-summary #:report-full-summary #:with-stats-collection - #:slurp-file-into-string #:camelCase-to-colname #:make-kernel #:list-encodings-and-aliases @@ -96,8 +95,16 @@ #:format-pgsql-create-index #:create-indexes-in-kernel)) +(defpackage #:pgloader.parse-date + (:use #:cl #:esrap) + (:export #:parse-date-string + #:parse-date-format)) + (defpackage #:pgloader.sources (:use #:cl #:pgloader.params #:pgloader.utils) + (:import-from #:pgloader.parse-date + #:parse-date-string + #:parse-date-format) (:export #:copy #:source-db #:target-db diff --git a/src/parsers/date-format.lisp b/src/parsers/date-format.lisp new file mode 100644 index 0000000..3fc2fdf --- /dev/null +++ b/src/parsers/date-format.lisp @@ -0,0 +1,125 @@ +;;; +;;; Parse PostgreSQL to_char() date format strings and transform them into a +;;; lisp date format that the parse-date-string function knows how to deal +;;; with. +;;; +;;; http://www.postgresql.org/docs/current/static/functions-formatting.html#FUNCTIONS-FORMATTING-DATETIME-TABLE +;;; + +(in-package #:pgloader.parse-date) + +(defvar *century* 2000) + +(defun parse-date-string + (date-string + &optional (format '((:year 0 4) + (:month 4 6) + (:day 6 8) + (:hour 8 10) + (:minute 10 12) + (:seconds 12 14) + (:msecs nil nil) + (:usecs nil nil) + + (:am nil nil) + (:pm nil nil)))) + "Apply this function when input date in like '20041002152952'" + ;; only process non-zero dates + (declare (type (or null string) date-string)) + (cond ((null date-string) nil) + ((string= date-string "") nil) + (t + (destructuring-bind (&key year month day hour minute seconds + am pm + msecs usecs + &allow-other-keys) + (loop + :for (name start end) :in format + :when (and start end) + :append (list name (subseq date-string start end))) + (if (or (string= year "0000") + (string= month "00") + (string= day "00")) + nil + (with-output-to-string (s) + (format s "~a-~a-~a ~a:~a:~a" + (let ((yint (parse-integer year))) + (if (< yint *century*) (+ *century* yint) yint)) + month + day + (let ((hint (parse-integer hour))) + (cond ((and am (= hint 12)) "00") + ((and pm (= hint 12)) "12") + ((and pm (< hint 12)) (+ hint 12)) + (t hour))) + minute + seconds) + (if usecs (format s ".~a" usecs) + (when msecs (format s ".~a" msecs))))))))) + + +;;; +;;; Parse the date format +;;; YYYY-MM-DD HH24-MI-SS +;;; +(defvar *offset* 0) + +(defun parse-date-format (format-string) + "Parse a given format string and return a format specification for + parse-date-string" + (let ((*offset* 0)) + (remove nil (parse 'format-string format-string)))) + +(defrule format-string (* (or year + month + day + hour + ampm + minute + seconds + milliseconds + microseconds + noise))) + +(defrule noise (+ (or #\- #\. #\Space #\* #\# #\@ #\/ #\\ "T")) + (:lambda (x) (incf *offset* (length (text x))) nil)) + +(defrule year (or year4 year3 year2)) +(defrule year4 "YYYY" (:lambda (x) (build-format-spec x :year 4))) +(defrule year3 "YYY" (:lambda (x) (build-format-spec x :year 3))) +(defrule year2 "YY" (:lambda (x) (build-format-spec x :year 2))) + +(defrule month mm) +(defrule mm "MM" (:lambda (x) (build-format-spec x :month 2))) + +(defrule day dd) +(defrule dd "DD" (:lambda (x) (build-format-spec x :day 2))) + +(defrule hour (or hh24 hh12 hh)) +(defrule hh "HH" (:lambda (x) (build-format-spec x :hour 2))) +(defrule hh12 "HH12" (:lambda (x) (build-format-spec x :hour 2))) +(defrule hh24 "HH24" (:lambda (x) (build-format-spec x :hour 2))) + +(defrule ampm (or am pm am-dot pm-dot)) +(defrule am (or "AM" "am" ) (:lambda (x) (build-format-spec x :am 2))) +(defrule am-dot (or "a.m." "A.M.") (:lambda (x) (build-format-spec x :am 4))) +(defrule pm (or "PM" "pm") (:lambda (x) (build-format-spec x :pm 2))) +(defrule pm-dot (or "p.m." "P.M.") (:lambda (x) (build-format-spec x :pm 4))) + +(defrule minute mi) +(defrule mi "MI" (:lambda (x) (build-format-spec x :minute 2))) + +(defrule seconds ss) +(defrule ss "SS" (:lambda (x) (build-format-spec x :seconds 2))) + +(defrule milliseconds ms) +(defrule ms "MS" (:lambda (x) (build-format-spec x :msecs 4))) + +(defrule microseconds us) +(defrule us "US" (:lambda (x) (build-format-spec x :usecs 6))) + +(defun build-format-spec (format-string part len) + (declare (ignore format-string)) + (prog1 + (list part *offset* (+ *offset* len)) + (incf *offset* len))) diff --git a/src/sources.lisp b/src/sources.lisp index 1512dcc..a8aa8b1 100644 --- a/src/sources.lisp +++ b/src/sources.lisp @@ -248,10 +248,19 @@ (if (null null-as) col (funcall (null-as-processing-fn null-as) col)))) (when value-or-null - (cond (trim-both (string-trim '(#\Space) value-or-null)) - (trim-left (string-left-trim '(#\Space) value-or-null)) - (trim-right (string-right-trim '(#\Space) value-or-null)) - (t value-or-null)))))))) + (let ((value-or-null + (cond (trim-both + (string-trim '(#\Space) value-or-null)) + (trim-left + (string-left-trim '(#\Space) value-or-null)) + (trim-right + (string-right-trim '(#\Space) value-or-null)) + (t value-or-null)))) + ;; now apply the date format, when given + (if date-format + (parse-date-string value-or-null + (parse-date-format date-format)) + value-or-null)))))))) (let* ((projection (cond @@ -269,12 +278,12 @@ (mapcar (function process-field) fields))) `(lambda (row) (let ((v (make-array (length row)))) - (loop - :for i :from 0 - :for col :in row - :for fn :in ',process-nulls - :do (setf (aref v i) (funcall fn col))) - v)))) + (loop + :for i :from 0 + :for col :in row + :for fn :in ',process-nulls + :do (setf (aref v i) (funcall fn col))) + v)))) (t ;; project some number of FIELDS into a possibly different diff --git a/test/Makefile b/test/Makefile index 3fa3c6c..b6644ba 100644 --- a/test/Makefile +++ b/test/Makefile @@ -7,6 +7,7 @@ LOCAL = $(filter-out $(REMOTE:.load=.out),$(OUT)) REGRESS= allcols.load \ csv-before-after.load \ csv-districts.load \ + csv-parse-date.load \ csv-error.load \ csv-filename-pattern.load \ csv-keep-extra-blanks.load \ diff --git a/test/csv-parse-date.load b/test/csv-parse-date.load new file mode 100644 index 0000000..43eb57b --- /dev/null +++ b/test/csv-parse-date.load @@ -0,0 +1,24 @@ +LOAD CSV + FROM inline (rownum, ts [date format 'YYYY-MM-DD HH24-MI-SS.US']) + INTO postgresql:///pgloader?dateformat (rownum, ts) + + WITH truncate, + fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by ',' + + SET client_encoding to 'latin1', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ drop table if exists dateformat; $$, + $$ create table dateformat ( + rownum smallint, + ts timestamptz + ); + $$; + +1,2014-10-02 00-33-12.123456 +2,2014-10-02 00-33-13.123456 +3,2014-10-02 00-33-14.123456 diff --git a/test/regress/expected/csv-parse-date.out b/test/regress/expected/csv-parse-date.out new file mode 100644 index 0000000..e69de29