diff --git a/pgloader.1 b/pgloader.1 index dc61ab5..6fe1f9e 100644 --- a/pgloader.1 +++ b/pgloader.1 @@ -548,6 +548,12 @@ When \fIblanks\fR is used and the field value that is read contains only space c .IP When a double\-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL \fBNULL\fR value\. . +.IP "\(bu" 4 +\fItrim both whitespace\fR, \fItrim left whitespace\fR, \fItrim right whitespace\fR +. +.IP +This option allows trimming whitespace in the read data, either from both sides of the data, or only the whitespace characters found on the left of the string, or only those on the right of the string\. +. .IP "" 0 . @@ -739,6 +745,12 @@ When \fIblanks\fR is used and the field value that is read contains only space c .IP When a double\-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL \fBNULL\fR value\. . +.IP "\(bu" 4 +\fItrim both whitespace\fR, \fItrim left whitespace\fR, \fItrim right whitespace\fR +. +.IP +This option allows trimming whitespace in the read data, either from both sides of the data, or only the whitespace characters found on the left of the string, or only those on the right of the string\. +. .IP "" 0 . diff --git a/pgloader.1.md b/pgloader.1.md index 02b0f7b..cb30d63 100644 --- a/pgloader.1.md +++ b/pgloader.1.md @@ -494,6 +494,12 @@ The `csv` format command accepts the following clauses and options: field value, then the field value is automatically converted to an SQL `NULL` value. + - *trim both whitespace*, *trim left whitespace*, *trim right whitespace* + + This option allows trimming whitespace in the read data, either from + both sides of the data, or only the whitespace characters found on + the left of the string, or only those on the right of the string. 
+ - *WITH* When loading from a `CSV` file, the following options are supported: @@ -662,6 +668,12 @@ The `fixed` format command accepts the following clauses and options: field value, then the field value is automatically converted to an SQL `NULL` value. + - *trim both whitespace*, *trim left whitespace*, *trim right whitespace* + + This option allows trimming whitespace in the read data, either from + both sides of the data, or only the whitespace characters found on + the left of the string, or only those on the right of the string. + - *WITH* When loading from a `CSV` file, the following options are supported: diff --git a/src/parser.lisp b/src/parser.lisp index 1beb789..4432143 100644 --- a/src/parser.lisp +++ b/src/parser.lisp @@ -115,6 +115,11 @@ (def-keyword-rule "terminated") (def-keyword-rule "nullif") (def-keyword-rule "blank") + (def-keyword-rule "trim") + (def-keyword-rule "both") + (def-keyword-rule "left") + (def-keyword-rule "right") + (def-keyword-rule "whitespace") (def-keyword-rule "skip") (def-keyword-rule "header") (def-keyword-rule "null") @@ -1700,9 +1705,21 @@ load database (declare (ignore null if)) (cons :null-as opt)))) +(defrule option-trim-both-whitespace (and kw-trim kw-both kw-whitespace) + (:constant (cons :trim-both t))) + +(defrule option-trim-left-whitespace (and kw-trim kw-left kw-whitespace) + (:constant (cons :trim-left t))) + +(defrule option-trim-right-whitespace (and kw-trim kw-right kw-whitespace) + (:constant (cons :trim-right t))) + (defrule csv-field-option (or option-terminated-by option-date-format - option-null-if)) + option-null-if + option-trim-both-whitespace + option-trim-left-whitespace + option-trim-right-whitespace)) (defrule csv-field-options (* csv-field-option) (:lambda (options) diff --git a/src/sources/sources.lisp b/src/sources/sources.lisp index 3d7164c..1512dcc 100644 --- a/src/sources/sources.lisp +++ b/src/sources/sources.lisp @@ -229,17 +229,29 @@ (list (pgloader.transforms:intern-symbol (car 
field-name-or-list))) (t (pgloader.transforms:intern-symbol field-name-or-list)))) - (field-process-null-fn (field-name-or-list) + (process-field (field-name-or-list) "Given a field entry, return a function dealing with nulls for it" - (destructuring-bind (&key null-as date-format &allow-other-keys) + (destructuring-bind (&key null-as + date-format + trim-both + trim-left + trim-right + &allow-other-keys) (typecase field-name-or-list (list (cdr field-name-or-list)) (t (cdr (assoc field-name-or-list fields :test #'string-equal)))) (declare (ignore date-format)) ; TODO - (if (null null-as) - #'identity - (null-as-processing-fn null-as))))) + ;; now prepare a function of a column + (lambda (col) + (let ((value-or-null + (if (null null-as) col + (funcall (null-as-processing-fn null-as) col)))) + (when value-or-null + (cond (trim-both (string-trim '(#\Space) value-or-null)) + (trim-left (string-left-trim '(#\Space) value-or-null)) + (trim-right (string-right-trim '(#\Space) value-or-null)) + (t value-or-null)))))))) (let* ((projection (cond @@ -254,7 +266,7 @@ ;; null-as, or the generic one if none has been given for ;; that field. 
(let ((process-nulls - (mapcar (function field-process-null-fn) fields))) + (mapcar (function process-field) fields))) `(lambda (row) (let ((v (make-array (length row)))) (loop @@ -279,7 +291,7 @@ (loop for field-name in args collect (list field-name - `(funcall ,(field-process-null-fn field-name) + `(funcall ,(process-field field-name) ,field-name)))) (newrow (loop for (name type fn) in columns @@ -287,7 +299,7 @@ ;; we expect the name of a COLUMN to be the same ;; as the name of its derived FIELD when we ;; don't have any transformation function - (or fn `(funcall ,(field-process-null-fn name) + (or fn `(funcall ,(process-field name) ,(field-name-as-symbol name)))))) `(lambda (row) (declare (optimize speed) (type list row)) diff --git a/test/fixed.load b/test/fixed.load index 3dfb568..3f4c9d8 100644 --- a/test/fixed.load +++ b/test/fixed.load @@ -12,7 +12,13 @@ */ LOAD FIXED - FROM inline (a 0 10, b 10 8, c 18 8, d 26 17) + FROM inline + ( -- col start length opts + a 0 10, + b 10 8, + c 18 8, + d 26 17 null if blanks trim right whitespace + ) INTO postgresql:///pgloader?fixed ( a, b, @@ -42,3 +48,5 @@ LOAD FIXED 01234567892008052011431250firstline 01234562008052115182300left blank-padded 12345678902008052208231560another line + 2345609872014092914371500 + 2345678902014092914371520 diff --git a/test/regress/expected/fixed.out b/test/regress/expected/fixed.out index 5cf7e12..b8e61fa 100644 --- a/test/regress/expected/fixed.out +++ b/test/regress/expected/fixed.out @@ -1,3 +1,5 @@ -123456789 2008-05-20 11:43:12.5 firstline +123456789 2008-05-20 11:43:12.5 firstline 123456 2008-05-21 15:18:23 left blank-padded -1234567890 2008-05-22 08:23:15.6 another line +1234567890 2008-05-22 08:23:15.6 another line +234560987 2014-09-29 14:37:15 \N +234567890 2014-09-29 14:37:15.2 \N