From abbc105c414bba39be551607bd10342fdda18f1f Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 21 May 2015 12:54:33 +0200 Subject: [PATCH] Implement CSV headers support. Some CSV files are given with a header line containing the list of their column names; use that line when given the option "csv header". Note that when both "skip header" and "csv header" options are used, pgloader first skips the required number of lines and then uses the next one as the csv header. Because of a temporary failure to install the `ronn` documentation tool, this patch only commits the changes to the source docs and does not update the man page (pgloader.1). A follow-up patch is intended to be pushed to fix that. See #236, which uses shell tricks to retrieve the field list from the CSV file itself and motivated this patch to finally get written. --- pgloader.1.md | 6 ++++++ src/parsers/command-csv.lisp | 4 ++++ src/sources/csv/csv.lisp | 28 ++++++++++++++++++++++++++-- test/Makefile | 1 + test/csv-header.load | 25 +++++++++++++++++++++++++ test/regress/expected/csv-header.out | 2 ++ 6 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 test/csv-header.load create mode 100644 test/regress/expected/csv-header.out diff --git a/pgloader.1.md b/pgloader.1.md index ba722fd..4238bf6 100644 --- a/pgloader.1.md +++ b/pgloader.1.md @@ -841,6 +841,12 @@ The `csv` format command accepts the following clauses and options: Takes a numeric value as argument. Instruct pgloader to skip that many lines at the beginning of the input file. + - *csv header* + + Use the first line read after *skip header* as the list of csv field + names to be found in the CSV file, using the same CSV parameters as + for the CSV data. 
+ - *trim unquoted blanks* When reading unquoted values in the `CSV` file, remove the blanks diff --git a/src/parsers/command-csv.lisp b/src/parsers/command-csv.lisp index cad8fde..1162700 100644 --- a/src/parsers/command-csv.lisp +++ b/src/parsers/command-csv.lisp @@ -49,6 +49,9 @@ (bind (((_ _ _ digits) osh)) (cons :skip-lines (parse-integer (text digits)))))) +(defrule option-csv-header (and kw-csv kw-header) + (:constant (cons :csv-header t))) + (defrule option-fields-enclosed-by (and kw-fields (? kw-optionally) kw-enclosed kw-by separator) (:lambda (enc) @@ -95,6 +98,7 @@ option-truncate option-disable-triggers option-skip-header + option-csv-header option-lines-terminated-by option-fields-not-enclosed option-fields-enclosed-by diff --git a/src/sources/csv/csv.lisp b/src/sources/csv/csv.lisp index dcd8f69..8ad9c77 100644 --- a/src/sources/csv/csv.lisp +++ b/src/sources/csv/csv.lisp @@ -39,7 +39,10 @@ :initarg :source-type) ; or :filename (encoding :accessor encoding ; file encoding :initarg :encoding) ; - (skip-lines :accessor skip-lines ; CSV headers + (csv-header :accessor csv-header ; CSV headers are col names + :initarg :csv-header + :initform nil) ; + (skip-lines :accessor skip-lines ; CSV skip first N lines :initarg :skip-lines ; :initform 0) ; (separator :accessor csv-separator ; CSV separator @@ -78,6 +81,20 @@ ;;; ;;; Read a file format in CSV format, and call given function on each line. ;;; +(defun parse-csv-header (csv header) + "Parse the header line given csv setup." 
+ ;; a field entry is a list of field name and options + (mapcar #'list + (car ; parsing a single line + (cl-csv:read-csv header + :separator (csv-separator csv) + :quote (csv-quote csv) + :escape (csv-escape csv) + :unquoted-empty-string-is-nil t + :quoted-empty-string-is-nil nil + :trim-outer-whitespace (csv-trim-blanks csv) + :newline (csv-newline csv))))) + (defmethod map-rows ((csv copy-csv) &key process-row-fn) "Load data from a text file in CSV format, with support for advanced projecting capabilities. See `project-fields' for details. @@ -114,6 +131,13 @@ ;; about skipping the first line (loop repeat (skip-lines csv) do (read-line input nil nil)) + ;; we might now have to read the CSV fields from the header line + (when (csv-header csv) + (setf (fields csv) + (parse-csv-header csv (read-line input nil nil))) + + (log-message :debug "Parsed header columns ~s" (fields csv))) + ;; read in the text file, split it into columns, process NULL ;; columns the way postmodern expects them, and call ;; PROCESS-ROW-FN on them @@ -153,7 +177,7 @@ (with-stats-collection ((target csv) :dbname (db-name (target-db csv)) :state *state* :summary summary) - (lp:task-handler-bind ((error #'lp:invoke-transfer-error)) + (lp:task-handler-bind () ;; ((error #'lp:invoke-transfer-error)) (log-message :notice "COPY ~a" (target csv)) (lp:submit-task channel #'copy-to-queue csv queue) diff --git a/test/Makefile b/test/Makefile index 36d5f8d..a4cd15c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -10,6 +10,7 @@ REGRESS= allcols.load \ csv-parse-date.load \ csv-error.load \ csv-filename-pattern.load \ + csv-header.load \ csv-keep-extra-blanks.load \ csv-nulls.load \ csv-trim-extra-blanks.load \ diff --git a/test/csv-header.load b/test/csv-header.load new file mode 100644 index 0000000..ccd72d2 --- /dev/null +++ b/test/csv-header.load @@ -0,0 +1,25 @@ +LOAD CSV + FROM INLINE + INTO postgresql://dim@localhost/pgloader?header + + WITH truncate, + fields terminated by ',', + csv header + + 
BEFORE LOAD DO + $$ drop table if exists header; $$, + $$ CREATE TABLE header + ( + somefields text, + rekplcode text, + "repl$grpid" text, + "repl$id" text, + another text, + fields text + ) + $$; + + +somefields,reklpcode,repl$grpid,repl$id,another,fields +a,b,c,d,e,f +foo,bar,baz,quux,foobar,fizzbuzz diff --git a/test/regress/expected/csv-header.out b/test/regress/expected/csv-header.out new file mode 100644 index 0000000..512042d --- /dev/null +++ b/test/regress/expected/csv-header.out @@ -0,0 +1,2 @@ +a b c d e f +foo bar baz quux foobar fizzbuzz