Implement the source date format option.

This commit is contained in:
Dimitri Fontaine 2014-10-02 01:00:03 +02:00
parent dfb1e9355a
commit 7cf7e714fc
9 changed files with 319 additions and 18 deletions

View File

@ -534,7 +534,56 @@ The processing of this option is not currently implemented\.
When the field is expected of the date type, then this option allows to specify the date format used in the file\.
.
.IP
The processing of this option is not currently implemented\.
Date format string are template strings modeled against the PostgreSQL \fBto_char\fR template strings support, limited to the following patterns:
.
.IP "\(bu" 4
YYYY, YYY, YY for the year part
.
.IP "\(bu" 4
MM for the numeric month part
.
.IP "\(bu" 4
DD for the numeric day part
.
.IP "\(bu" 4
HH, HH12, HH24 for the hour part
.
.IP "\(bu" 4
am, AM, a\.m\., A\.M\.
.
.IP "\(bu" 4
pm, PM, p\.m\., P\.M\.
.
.IP "\(bu" 4
MI for the minutes part
.
.IP "\(bu" 4
SS for the seconds part
.
.IP "\(bu" 4
MS for the milliseconds part (4 digits)
.
.IP "\(bu" 4
US for the microseconds part (6 digits)
.
.IP "\(bu" 4
unparsed punctuation signs: \- \. * # @ T / \e and space
.
.IP "" 0
.
.IP
Here\'s an example of a \fIdate format\fR specification:
.
.IP "" 4
.
.nf
column\-name [date format \'YYYY\-MM\-DD HH24\-MI\-SS\.US\']
.
.fi
.
.IP "" 0
.
.IP "\(bu" 4
\fInull if\fR
@ -744,7 +793,56 @@ The processing of this option is not currently implemented\.
When the field is expected of the date type, then this option allows to specify the date format used in the file\.
.
.IP
The processing of this option is not currently implemented\.
Date format string are template strings modeled against the PostgreSQL \fBto_char\fR template strings support, limited to the following patterns:
.
.IP "\(bu" 4
YYYY, YYY, YY for the year part
.
.IP "\(bu" 4
MM for the numeric month part
.
.IP "\(bu" 4
DD for the numeric day part
.
.IP "\(bu" 4
HH, HH12, HH24 for the hour part
.
.IP "\(bu" 4
am, AM, a\.m\., A\.M\.
.
.IP "\(bu" 4
pm, PM, p\.m\., P\.M\.
.
.IP "\(bu" 4
MI for the minutes part
.
.IP "\(bu" 4
SS for the seconds part
.
.IP "\(bu" 4
MS for the milliseconds part (4 digits)
.
.IP "\(bu" 4
US for the microseconds part (6 digits)
.
.IP "\(bu" 4
unparsed punctuation signs: \- \. * # @ T / \e and space
.
.IP "" 0
.
.IP
Here\'s an example of a \fIdate format\fR specification:
.
.IP "" 4
.
.nf
column\-name [date format \'YYYY\-MM\-DD HH24\-MI\-SS\.US\']
.
.fi
.
.IP "" 0
.
.IP "\(bu" 4
\fInull if\fR

View File

@ -479,7 +479,25 @@ The `csv` format command accepts the following clauses and options:
When the field is expected of the date type, then this option allows
to specify the date format used in the file.
The processing of this option is not currently implemented.
Date format string are template strings modeled against the
PostgreSQL `to_char` template strings support, limited to the
following patterns:
- YYYY, YYY, YY for the year part
- MM for the numeric month part
- DD for the numeric day part
- HH, HH12, HH24 for the hour part
- am, AM, a.m., A.M.
- pm, PM, p.m., P.M.
- MI for the minutes part
- SS for the seconds part
- MS for the milliseconds part (4 digits)
- US for the microseconds part (6 digits)
- unparsed punctuation signs: - . * # @ T / \ and space
Here's an example of a *date format* specification:
column-name [date format 'YYYY-MM-DD HH24-MI-SS.US']
- *null if*
@ -664,7 +682,25 @@ The `fixed` format command accepts the following clauses and options:
When the field is expected of the date type, then this option allows
to specify the date format used in the file.
The processing of this option is not currently implemented.
Date format string are template strings modeled against the
PostgreSQL `to_char` template strings support, limited to the
following patterns:
- YYYY, YYY, YY for the year part
- MM for the numeric month part
- DD for the numeric day part
- HH, HH12, HH24 for the hour part
- am, AM, a.m., A.M.
- pm, PM, p.m., P.M.
- MI for the minutes part
- SS for the seconds part
- MS for the milliseconds part (4 digits)
- US for the microseconds part (6 digits)
- unparsed punctuation signs: - . * # @ T / \ and space
Here's an example of a *date format* specification:
column-name [date format 'YYYY-MM-DD HH24-MI-SS.US']
- *null if*

View File

@ -42,13 +42,13 @@
:depends-on ("package" "params")
:components
((:file "charsets")
(:file "archive")
(:file "threads")
(:file "logs")
(:file "monitor" :depends-on ("logs"))
(:file "state")
(:file "report" :depends-on ("state"))
(:file "utils" :depends-on ("charsets" "monitor"))
(:file "archive" :depends-on ("logs"))
;; those are one-package-per-file
(:file "transforms")
@ -70,12 +70,13 @@
:depends-on ("params" "package" "utils" "pgsql")
:components
((:file "parse-ini")
(:file "parser")))
(:file "parser")
(:file "date-format")))
;; generic API for Sources
(:file "sources-api"
:pathname "sources"
:depends-on ("params" "package" "utils"))
:depends-on ("params" "package" "utils" "parsers"))
;; Source format specific implementations
(:module sources

View File

@ -54,7 +54,6 @@
#:report-summary
#:report-full-summary
#:with-stats-collection
#:slurp-file-into-string
#:camelCase-to-colname
#:make-kernel
#:list-encodings-and-aliases
@ -96,8 +95,16 @@
#:format-pgsql-create-index
#:create-indexes-in-kernel))
(defpackage #:pgloader.parse-date
(:use #:cl #:esrap)
(:export #:parse-date-string
#:parse-date-format))
(defpackage #:pgloader.sources
(:use #:cl #:pgloader.params #:pgloader.utils)
(:import-from #:pgloader.parse-date
#:parse-date-string
#:parse-date-format)
(:export #:copy
#:source-db
#:target-db

View File

@ -0,0 +1,125 @@
;;;
;;; Parse PostgreSQL to_char() date format strings and transform them into a
;;; lisp date format that the parse-date-string function knows how to deal
;;; with.
;;;
;;; http://www.postgresql.org/docs/current/static/functions-formatting.html#FUNCTIONS-FORMATTING-DATETIME-TABLE
;;;
(in-package #:pgloader.parse-date)
(defvar *century* 2000)
(defun parse-date-string
(date-string
&optional (format '((:year 0 4)
(:month 4 6)
(:day 6 8)
(:hour 8 10)
(:minute 10 12)
(:seconds 12 14)
(:msecs nil nil)
(:usecs nil nil)
(:am nil nil)
(:pm nil nil))))
"Apply this function when input date in like '20041002152952'"
;; only process non-zero dates
(declare (type (or null string) date-string))
(cond ((null date-string) nil)
((string= date-string "") nil)
(t
(destructuring-bind (&key year month day hour minute seconds
am pm
msecs usecs
&allow-other-keys)
(loop
:for (name start end) :in format
:when (and start end)
:append (list name (subseq date-string start end)))
(if (or (string= year "0000")
(string= month "00")
(string= day "00"))
nil
(with-output-to-string (s)
(format s "~a-~a-~a ~a:~a:~a"
(let ((yint (parse-integer year)))
(if (< yint *century*) (+ *century* yint) yint))
month
day
(let ((hint (parse-integer hour)))
(cond ((and am (= hint 12)) "00")
((and pm (= hint 12)) "12")
((and pm (< hint 12)) (+ hint 12))
(t hour)))
minute
seconds)
(if usecs (format s ".~a" usecs)
(when msecs (format s ".~a" msecs)))))))))
;;;
;;; Parse the date format
;;; YYYY-MM-DD HH24-MI-SS
;;;
(defvar *offset* 0)
(defun parse-date-format (format-string)
"Parse a given format string and return a format specification for
parse-date-string"
(let ((*offset* 0))
(remove nil (parse 'format-string format-string))))
(defrule format-string (* (or year
month
day
hour
ampm
minute
seconds
milliseconds
microseconds
noise)))
(defrule noise (+ (or #\- #\. #\Space #\* #\# #\@ #\/ #\\ "T"))
(:lambda (x) (incf *offset* (length (text x))) nil))
(defrule year (or year4 year3 year2))
(defrule year4 "YYYY" (:lambda (x) (build-format-spec x :year 4)))
(defrule year3 "YYY" (:lambda (x) (build-format-spec x :year 3)))
(defrule year2 "YY" (:lambda (x) (build-format-spec x :year 2)))
(defrule month mm)
(defrule mm "MM" (:lambda (x) (build-format-spec x :month 2)))
(defrule day dd)
(defrule dd "DD" (:lambda (x) (build-format-spec x :day 2)))
(defrule hour (or hh24 hh12 hh))
(defrule hh "HH" (:lambda (x) (build-format-spec x :hour 2)))
(defrule hh12 "HH12" (:lambda (x) (build-format-spec x :hour 2)))
(defrule hh24 "HH24" (:lambda (x) (build-format-spec x :hour 2)))
(defrule ampm (or am pm am-dot pm-dot))
(defrule am (or "AM" "am" ) (:lambda (x) (build-format-spec x :am 2)))
(defrule am-dot (or "a.m." "A.M.") (:lambda (x) (build-format-spec x :am 4)))
(defrule pm (or "PM" "pm") (:lambda (x) (build-format-spec x :pm 2)))
(defrule pm-dot (or "p.m." "P.M.") (:lambda (x) (build-format-spec x :pm 4)))
(defrule minute mi)
(defrule mi "MI" (:lambda (x) (build-format-spec x :minute 2)))
(defrule seconds ss)
(defrule ss "SS" (:lambda (x) (build-format-spec x :seconds 2)))
(defrule milliseconds ms)
(defrule ms "MS" (:lambda (x) (build-format-spec x :msecs 4)))
(defrule microseconds us)
(defrule us "US" (:lambda (x) (build-format-spec x :usecs 6)))
(defun build-format-spec (format-string part len)
(declare (ignore format-string))
(prog1
(list part *offset* (+ *offset* len))
(incf *offset* len)))

View File

@ -248,10 +248,19 @@
(if (null null-as) col
(funcall (null-as-processing-fn null-as) col))))
(when value-or-null
(cond (trim-both (string-trim '(#\Space) value-or-null))
(trim-left (string-left-trim '(#\Space) value-or-null))
(trim-right (string-right-trim '(#\Space) value-or-null))
(t value-or-null))))))))
(let ((value-or-null
(cond (trim-both
(string-trim '(#\Space) value-or-null))
(trim-left
(string-left-trim '(#\Space) value-or-null))
(trim-right
(string-right-trim '(#\Space) value-or-null))
(t value-or-null))))
;; now apply the date format, when given
(if date-format
(parse-date-string value-or-null
(parse-date-format date-format))
value-or-null))))))))
(let* ((projection
(cond
@ -269,12 +278,12 @@
(mapcar (function process-field) fields)))
`(lambda (row)
(let ((v (make-array (length row))))
(loop
:for i :from 0
:for col :in row
:for fn :in ',process-nulls
:do (setf (aref v i) (funcall fn col)))
v))))
(loop
:for i :from 0
:for col :in row
:for fn :in ',process-nulls
:do (setf (aref v i) (funcall fn col)))
v))))
(t
;; project some number of FIELDS into a possibly different

View File

@ -7,6 +7,7 @@ LOCAL = $(filter-out $(REMOTE:.load=.out),$(OUT))
REGRESS= allcols.load \
csv-before-after.load \
csv-districts.load \
csv-parse-date.load \
csv-error.load \
csv-filename-pattern.load \
csv-keep-extra-blanks.load \

24
test/csv-parse-date.load Normal file
View File

@ -0,0 +1,24 @@
LOAD CSV
FROM inline (rownum, ts [date format 'YYYY-MM-DD HH24-MI-SS.US'])
INTO postgresql:///pgloader?dateformat (rownum, ts)
WITH truncate,
fields optionally enclosed by '"',
fields escaped by double-quote,
fields terminated by ','
SET client_encoding to 'latin1',
work_mem to '12MB',
standard_conforming_strings to 'on'
BEFORE LOAD DO
$$ drop table if exists dateformat; $$,
$$ create table dateformat (
rownum smallint,
ts timestamptz
);
$$;
1,2014-10-02 00-33-12.123456
2,2014-10-02 00-33-13.123456
3,2014-10-02 00-33-14.123456

View File