Implement a new source level filter: trim.

As seen in #116, it might be better for the users to be able to ask for
field trimming right in the source definition, like we do for processing
nulls.
This commit is contained in:
Dimitri Fontaine 2014-09-29 15:16:04 +02:00
parent 8a0c91fa40
commit ea97fc4659
6 changed files with 75 additions and 12 deletions

View File

@ -548,6 +548,12 @@ When \fIblanks\fR is used and the field value that is read contains only space c
.IP
When a double\-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL \fBNULL\fR value\.
.
.IP "\(bu" 4
\fItrim both whitespace\fR, \fItrim left whitespace\fR, \fItrim right whitespace\fR
.
.IP
This option trims whitespace from the data that is read: either from both sides of the string, only from the left of the string, or only from the right of the string\.
.
.IP "" 0
.
@ -739,6 +745,12 @@ When \fIblanks\fR is used and the field value that is read contains only space c
.IP
When a double\-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL \fBNULL\fR value\.
.
.IP "\(bu" 4
\fItrim both whitespace\fR, \fItrim left whitespace\fR, \fItrim right whitespace\fR
.
.IP
This option trims whitespace from the data that is read: either from both sides of the string, only from the left of the string, or only from the right of the string\.
.
.IP "" 0
.

View File

@ -494,6 +494,12 @@ The `csv` format command accepts the following clauses and options:
field value, then the field value is automatically converted to an
SQL `NULL` value.
- *trim both whitespace*, *trim left whitespace*, *trim right whitespace*
This option trims whitespace from the data that is read: either from
both sides of the string, only from the left of the string, or only
from the right of the string.
- *WITH*
When loading from a `CSV` file, the following options are supported:
@ -662,6 +668,12 @@ The `fixed` format command accepts the following clauses and options:
field value, then the field value is automatically converted to an
SQL `NULL` value.
- *trim both whitespace*, *trim left whitespace*, *trim right whitespace*
This option trims whitespace from the data that is read: either from
both sides of the string, only from the left of the string, or only
from the right of the string.
- *WITH*
When loading from a `CSV` file, the following options are supported:

View File

@ -115,6 +115,11 @@
(def-keyword-rule "terminated")
(def-keyword-rule "nullif")
(def-keyword-rule "blank")
;; Keywords making up the "trim both|left|right whitespace" field options;
;; the option-trim-*-whitespace rules refer to them as kw-trim, kw-both,
;; kw-left, kw-right and kw-whitespace.
(def-keyword-rule "trim")
(def-keyword-rule "both")
(def-keyword-rule "left")
(def-keyword-rule "right")
(def-keyword-rule "whitespace")
(def-keyword-rule "skip")
(def-keyword-rule "header")
(def-keyword-rule "null")
@ -1700,9 +1705,21 @@ load database
(declare (ignore null if))
(cons :null-as opt))))
;; Field option "trim both whitespace": matches the three keywords in
;; sequence and always produces the pair (:trim-both . t).
;; NOTE(review): presumably collected into the field's option list and read
;; back as the :trim-both key when projecting fields -- confirm in callers.
(defrule option-trim-both-whitespace (and kw-trim kw-both kw-whitespace)
(:constant (cons :trim-both t)))
;; Field option "trim left whitespace": matches the three keywords in
;; sequence and always produces the pair (:trim-left . t).
(defrule option-trim-left-whitespace (and kw-trim kw-left kw-whitespace)
(:constant (cons :trim-left t)))
;; Field option "trim right whitespace": matches the three keywords in
;; sequence and always produces the pair (:trim-right . t).
(defrule option-trim-right-whitespace (and kw-trim kw-right kw-whitespace)
(:constant (cons :trim-right t)))
(defrule csv-field-option (or option-terminated-by
option-date-format
option-null-if))
option-null-if
option-trim-both-whitespace
option-trim-left-whitespace
option-trim-right-whitespace))
(defrule csv-field-options (* csv-field-option)
(:lambda (options)

View File

@ -229,17 +229,29 @@
(list (pgloader.transforms:intern-symbol (car field-name-or-list)))
(t (pgloader.transforms:intern-symbol field-name-or-list))))
(field-process-null-fn (field-name-or-list)
(process-field (field-name-or-list)
"Given a field entry, return a function dealing with nulls for it"
(destructuring-bind (&key null-as date-format &allow-other-keys)
(destructuring-bind (&key null-as
date-format
trim-both
trim-left
trim-right
&allow-other-keys)
(typecase field-name-or-list
(list (cdr field-name-or-list))
(t (cdr (assoc field-name-or-list fields
:test #'string-equal))))
(declare (ignore date-format)) ; TODO
(if (null null-as)
#'identity
(null-as-processing-fn null-as)))))
;; now prepare a function of a column
(lambda (col)
(let ((value-or-null
(if (null null-as) col
(funcall (null-as-processing-fn null-as) col))))
(when value-or-null
(cond (trim-both (string-trim '(#\Space) value-or-null))
(trim-left (string-left-trim '(#\Space) value-or-null))
(trim-right (string-right-trim '(#\Space) value-or-null))
(t value-or-null))))))))
(let* ((projection
(cond
@ -254,7 +266,7 @@
;; null-as, or the generic one if none has been given for
;; that field.
(let ((process-nulls
(mapcar (function field-process-null-fn) fields)))
(mapcar (function process-field) fields)))
`(lambda (row)
(let ((v (make-array (length row))))
(loop
@ -279,7 +291,7 @@
(loop for field-name in args
collect (list
field-name
`(funcall ,(field-process-null-fn field-name)
`(funcall ,(process-field field-name)
,field-name))))
(newrow
(loop for (name type fn) in columns
@ -287,7 +299,7 @@
;; we expect the name of a COLUMN to be the same
;; as the name of its derived FIELD when we
;; don't have any transformation function
(or fn `(funcall ,(field-process-null-fn name)
(or fn `(funcall ,(process-field name)
,(field-name-as-symbol name))))))
`(lambda (row)
(declare (optimize speed) (type list row))

View File

@ -12,7 +12,13 @@
*/
LOAD FIXED
FROM inline (a 0 10, b 10 8, c 18 8, d 26 17)
FROM inline
( -- col start length opts
a 0 10,
b 10 8,
c 18 8,
d 26 17 null if blanks trim right whitespace
)
INTO postgresql:///pgloader?fixed
(
a, b,
@ -42,3 +48,5 @@ LOAD FIXED
01234567892008052011431250firstline
01234562008052115182300left blank-padded
12345678902008052208231560another line
2345609872014092914371500
2345678902014092914371520

View File

@ -1,3 +1,5 @@
123456789 2008-05-20 11:43:12.5 firstline
123456789 2008-05-20 11:43:12.5 firstline
123456 2008-05-21 15:18:23 left blank-padded
1234567890 2008-05-22 08:23:15.6 another line
1234567890 2008-05-22 08:23:15.6 another line
234560987 2014-09-29 14:37:15 \N
234567890 2014-09-29 14:37:15.2 \N