Implement CSV headers support.

Some CSV files are given with a header line containing the list of
their column names; use that when given the option "csv header".

Note that when both "skip header" and "csv header" options are used,
pgloader first skips the required number of lines and then uses the
next line as the CSV header.

Because of temporary failure to install the `ronn` documentation tool,
this patch only commits the changes to the source docs and omits to
update the man page (pgloader.1). A follow-up patch is intended to be
pushed that fixes that.

See #236 which is using shell tricks to retrieve the field list from the
CSV file itself and motivated this patch to finally get written.
This commit is contained in:
Dimitri Fontaine 2015-05-21 12:54:33 +02:00
parent dfb4cc2049
commit abbc105c41
6 changed files with 64 additions and 2 deletions

View File

@ -841,6 +841,12 @@ The `csv` format command accepts the following clauses and options:
Takes a numeric value as argument. Instruct pgloader to skip that
many lines at the beginning of the input file.
- *csv header*
Use the first line read after *skip header* as the list of csv field
names to be found in the CSV file, using the same CSV parameters as
for the CSV data.
- *trim unquoted blanks*
When reading unquoted values in the `CSV` file, remove the blanks

View File

@ -49,6 +49,9 @@
(bind (((_ _ _ digits) osh))
(cons :skip-lines (parse-integer (text digits))))))
;; Grammar rule for the "csv header" WITH option: when present, the
;; first data line (after any "skip header" lines) names the CSV fields.
(defrule option-csv-header (and kw-csv kw-header)
  (:lambda (matched)
    (declare (ignore matched))
    ;; same option cons cell as the :constant form would produce
    (cons :csv-header t)))
(defrule option-fields-enclosed-by
(and kw-fields (? kw-optionally) kw-enclosed kw-by separator)
(:lambda (enc)
@ -95,6 +98,7 @@
option-truncate
option-disable-triggers
option-skip-header
option-csv-header
option-lines-terminated-by
option-fields-not-enclosed
option-fields-enclosed-by

View File

@ -39,7 +39,10 @@
:initarg :source-type) ; or :filename
(encoding :accessor encoding ; file encoding
:initarg :encoding) ;
(skip-lines :accessor skip-lines ; CSV headers
(csv-header :accessor csv-header ; CSV headers are col names
:initarg :csv-header
:initform nil) ;
  (skip-lines :accessor skip-lines ; CSV skip first N lines
:initarg :skip-lines ;
:initform 0) ;
(separator :accessor csv-separator ; CSV separator
@ -78,6 +81,20 @@
;;;
;;; Read a file format in CSV format, and call given function on each line.
;;;
(defun parse-csv-header (csv header)
  "Parse HEADER, a single CSV line, using the same CSV parameters as the
   data of the CSV source object, and return a field list suitable for
   (fields csv): each entry is a list whose first element is the field
   name, with no extra per-field options."
  (let ((rows (cl-csv:read-csv header
                               :separator (csv-separator csv)
                               :quote (csv-quote csv)
                               :escape (csv-escape csv)
                               :unquoted-empty-string-is-nil t
                               :quoted-empty-string-is-nil nil
                               :trim-outer-whitespace (csv-trim-blanks csv)
                               :newline (csv-newline csv))))
    ;; read-csv returns a list of rows; HEADER is one line, so take the
    ;; first row and wrap each column name into its own (name) list.
    (mapcar #'list (first rows))))
(defmethod map-rows ((csv copy-csv) &key process-row-fn)
"Load data from a text file in CSV format, with support for advanced
projecting capabilities. See `project-fields' for details.
@ -114,6 +131,13 @@
;; about skipping the first line
(loop repeat (skip-lines csv) do (read-line input nil nil))
;; we might now have to read the CSV fields from the header line
(when (csv-header csv)
(setf (fields csv)
(parse-csv-header csv (read-line input nil nil)))
(log-message :debug "Parsed header columns ~s" (fields csv)))
;; read in the text file, split it into columns, process NULL
;; columns the way postmodern expects them, and call
;; PROCESS-ROW-FN on them
@ -153,7 +177,7 @@
(with-stats-collection ((target csv)
:dbname (db-name (target-db csv))
:state *state* :summary summary)
(lp:task-handler-bind ((error #'lp:invoke-transfer-error))
(lp:task-handler-bind () ;; ((error #'lp:invoke-transfer-error))
(log-message :notice "COPY ~a" (target csv))
(lp:submit-task channel #'copy-to-queue csv queue)

View File

@ -10,6 +10,7 @@ REGRESS= allcols.load \
csv-parse-date.load \
csv-error.load \
csv-filename-pattern.load \
csv-header.load \
csv-keep-extra-blanks.load \
csv-nulls.load \
csv-trim-extra-blanks.load \

25
test/csv-header.load Normal file
View File

@ -0,0 +1,25 @@
LOAD CSV
FROM INLINE
INTO postgresql://dim@localhost/pgloader?header
WITH truncate,
fields terminated by ',',
csv header
BEFORE LOAD DO
$$ drop table if exists header; $$,
$$ CREATE TABLE header
(
somefields text,
rekplcode text,
"repl$grpid" text,
"repl$id" text,
another text,
fields text
)
$$;
somefields,rekplcode,repl$grpid,repl$id,another,fields
a,b,c,d,e,f
foo,bar,baz,quux,foobar,fizzbuzz

View File

@ -0,0 +1,2 @@
a b c d e f
foo bar baz quux foobar fizzbuzz