mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-04 18:36:12 +02:00
Implement a new source level filter: trim.
As seen in #116, it might be better for the users to be able to ask for field trimming right in the source definition, like we do for processing nulls.
This commit is contained in:
parent
8a0c91fa40
commit
ea97fc4659
12
pgloader.1
12
pgloader.1
@ -548,6 +548,12 @@ When \fIblanks\fR is used and the field value that is read contains only space c
|
||||
.IP
|
||||
When a double\-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL \fBNULL\fR value\.
|
||||
.
|
||||
.IP "\(bu" 4
|
||||
\fItrim both whitespace\fR, \fItrim left whitespace\fR, \fItrim right whitespace\fR
|
||||
.
|
||||
.IP
|
||||
This option allows trimming whitespace in the read data, either from both sides of the data, or only the whitespace characters found on the left of the string, or only those on the right of the string\.
|
||||
.
|
||||
.IP "" 0
|
||||
|
||||
.
|
||||
@ -739,6 +745,12 @@ When \fIblanks\fR is used and the field value that is read contains only space c
|
||||
.IP
|
||||
When a double\-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL \fBNULL\fR value\.
|
||||
.
|
||||
.IP "\(bu" 4
|
||||
\fItrim both whitespace\fR, \fItrim left whitespace\fR, \fItrim right whitespace\fR
|
||||
.
|
||||
.IP
|
||||
This option allows trimming whitespace in the read data, either from both sides of the data, or only the whitespace characters found on the left of the string, or only those on the right of the string\.
|
||||
.
|
||||
.IP "" 0
|
||||
|
||||
.
|
||||
|
||||
@ -494,6 +494,12 @@ The `csv` format command accepts the following clauses and options:
|
||||
field value, then the field value is automatically converted to an
|
||||
SQL `NULL` value.
|
||||
|
||||
- *trim both whitespace*, *trim left whitespace*, *trim right whitespace*
|
||||
|
||||
This option allows trimming whitespace in the read data, either from
|
||||
both sides of the data, or only the whitespace characters found on
|
||||
the left of the string, or only those on the right of the string.
|
||||
|
||||
- *WITH*
|
||||
|
||||
When loading from a `CSV` file, the following options are supported:
|
||||
@ -662,6 +668,12 @@ The `fixed` format command accepts the following clauses and options:
|
||||
field value, then the field value is automatically converted to an
|
||||
SQL `NULL` value.
|
||||
|
||||
- *trim both whitespace*, *trim left whitespace*, *trim right whitespace*
|
||||
|
||||
This option allows trimming whitespace in the read data, either from
|
||||
both sides of the data, or only the whitespace characters found on
|
||||
the left of the string, or only those on the right of the string.
|
||||
|
||||
- *WITH*
|
||||
|
||||
When loading from a `FIXED` file, the following options are supported:
|
||||
|
||||
@ -115,6 +115,11 @@
|
||||
(def-keyword-rule "terminated")
|
||||
(def-keyword-rule "nullif")
|
||||
(def-keyword-rule "blank")
|
||||
(def-keyword-rule "trim")
|
||||
(def-keyword-rule "both")
|
||||
(def-keyword-rule "left")
|
||||
(def-keyword-rule "right")
|
||||
(def-keyword-rule "whitespace")
|
||||
(def-keyword-rule "skip")
|
||||
(def-keyword-rule "header")
|
||||
(def-keyword-rule "null")
|
||||
@ -1700,9 +1705,21 @@ load database
|
||||
(declare (ignore null if))
|
||||
(cons :null-as opt))))
|
||||
|
||||
(defrule option-trim-both-whitespace (and kw-trim kw-both kw-whitespace)
|
||||
(:constant (cons :trim-both t)))
|
||||
|
||||
(defrule option-trim-left-whitespace (and kw-trim kw-left kw-whitespace)
|
||||
(:constant (cons :trim-left t)))
|
||||
|
||||
(defrule option-trim-right-whitespace (and kw-trim kw-right kw-whitespace)
|
||||
(:constant (cons :trim-right t)))
|
||||
|
||||
(defrule csv-field-option (or option-terminated-by
|
||||
option-date-format
|
||||
option-null-if))
|
||||
option-null-if
|
||||
option-trim-both-whitespace
|
||||
option-trim-left-whitespace
|
||||
option-trim-right-whitespace))
|
||||
|
||||
(defrule csv-field-options (* csv-field-option)
|
||||
(:lambda (options)
|
||||
|
||||
@ -229,17 +229,29 @@
|
||||
(list (pgloader.transforms:intern-symbol (car field-name-or-list)))
|
||||
(t (pgloader.transforms:intern-symbol field-name-or-list))))
|
||||
|
||||
(field-process-null-fn (field-name-or-list)
|
||||
(process-field (field-name-or-list)
|
||||
"Given a field entry, return a function dealing with nulls for it"
|
||||
(destructuring-bind (&key null-as date-format &allow-other-keys)
|
||||
(destructuring-bind (&key null-as
|
||||
date-format
|
||||
trim-both
|
||||
trim-left
|
||||
trim-right
|
||||
&allow-other-keys)
|
||||
(typecase field-name-or-list
|
||||
(list (cdr field-name-or-list))
|
||||
(t (cdr (assoc field-name-or-list fields
|
||||
:test #'string-equal))))
|
||||
(declare (ignore date-format)) ; TODO
|
||||
(if (null null-as)
|
||||
#'identity
|
||||
(null-as-processing-fn null-as)))))
|
||||
;; now prepare a function of a column
|
||||
(lambda (col)
|
||||
(let ((value-or-null
|
||||
(if (null null-as) col
|
||||
(funcall (null-as-processing-fn null-as) col))))
|
||||
(when value-or-null
|
||||
(cond (trim-both (string-trim '(#\Space) value-or-null))
|
||||
(trim-left (string-left-trim '(#\Space) value-or-null))
|
||||
(trim-right (string-right-trim '(#\Space) value-or-null))
|
||||
(t value-or-null))))))))
|
||||
|
||||
(let* ((projection
|
||||
(cond
|
||||
@ -254,7 +266,7 @@
|
||||
;; null-as, or the generic one if none has been given for
|
||||
;; that field.
|
||||
(let ((process-nulls
|
||||
(mapcar (function field-process-null-fn) fields)))
|
||||
(mapcar (function process-field) fields)))
|
||||
`(lambda (row)
|
||||
(let ((v (make-array (length row))))
|
||||
(loop
|
||||
@ -279,7 +291,7 @@
|
||||
(loop for field-name in args
|
||||
collect (list
|
||||
field-name
|
||||
`(funcall ,(field-process-null-fn field-name)
|
||||
`(funcall ,(process-field field-name)
|
||||
,field-name))))
|
||||
(newrow
|
||||
(loop for (name type fn) in columns
|
||||
@ -287,7 +299,7 @@
|
||||
;; we expect the name of a COLUMN to be the same
|
||||
;; as the name of its derived FIELD when we
|
||||
;; don't have any transformation function
|
||||
(or fn `(funcall ,(field-process-null-fn name)
|
||||
(or fn `(funcall ,(process-field name)
|
||||
,(field-name-as-symbol name))))))
|
||||
`(lambda (row)
|
||||
(declare (optimize speed) (type list row))
|
||||
|
||||
@ -12,7 +12,13 @@
|
||||
*/
|
||||
|
||||
LOAD FIXED
|
||||
FROM inline (a 0 10, b 10 8, c 18 8, d 26 17)
|
||||
FROM inline
|
||||
( -- col start length opts
|
||||
a 0 10,
|
||||
b 10 8,
|
||||
c 18 8,
|
||||
d 26 17 null if blanks trim right whitespace
|
||||
)
|
||||
INTO postgresql:///pgloader?fixed
|
||||
(
|
||||
a, b,
|
||||
@ -42,3 +48,5 @@ LOAD FIXED
|
||||
01234567892008052011431250firstline
|
||||
01234562008052115182300left blank-padded
|
||||
12345678902008052208231560another line
|
||||
2345609872014092914371500
|
||||
2345678902014092914371520
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
123456789 2008-05-20 11:43:12.5 firstline
|
||||
123456789 2008-05-20 11:43:12.5 firstline
|
||||
123456 2008-05-21 15:18:23 left blank-padded
|
||||
1234567890 2008-05-22 08:23:15.6 another line
|
||||
1234567890 2008-05-22 08:23:15.6 another line
|
||||
234560987 2014-09-29 14:37:15 \N
|
||||
234567890 2014-09-29 14:37:15.2 \N
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user