From dae5dec03c3618050ebbfaef788eedb00ea3269e Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 15 Jan 2019 22:39:08 +0100 Subject: [PATCH] Allow fields/columns projections when parsing header. When using a CSV header, we might find fields in a different order than the target table columns, and maybe not all of the fields are going to be read. Take account of the header we read rather than expecting the header to look like the target table definition. Fix #888. --- src/load/load-file.lisp | 2 ++ src/sources/common/api.lisp | 2 +- src/sources/common/md-methods.lisp | 10 +++------ src/sources/csv/csv.lisp | 32 ++++++++++++++++++---------- test/csv-header.load | 8 +++---- test/regress/expected/csv-header.out | 4 ++-- 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/load/load-file.lisp b/src/load/load-file.lisp index 02ddb45..de525dc 100644 --- a/src/load/load-file.lisp +++ b/src/load/load-file.lisp @@ -98,6 +98,8 @@ (loop :for path-spec :in path-list :count t :do (let ((table-source (clone-copy-for copy path-spec))) + (when (and (header table-source) (null (fields table-source))) + (parse-header table-source)) (incf task-count (copy-from table-source :concurrency concurrency diff --git a/src/sources/common/api.lisp b/src/sources/common/api.lisp index 091e3b5..d0d1a38 100644 --- a/src/sources/common/api.lisp +++ b/src/sources/common/api.lisp @@ -95,7 +95,7 @@ :initform nil)) ; (:documentation "pgloader Multiple Files Data Source (csv, fixed, copy).")) -(defgeneric parse-header (md-copy header) +(defgeneric parse-header (md-copy) (:documentation "Parse the file header and return a list of fields.")) (defgeneric process-rows (md-copy stream process-fn) diff --git a/src/sources/common/md-methods.lisp b/src/sources/common/md-methods.lisp index 8694dd1..0a8eacc 100644 --- a/src/sources/common/md-methods.lisp +++ b/src/sources/common/md-methods.lisp @@ -4,7 +4,7 @@ (in-package #:pgloader.sources) -(defmethod parse-header ((copy md-copy) header) 
+(defmethod parse-header ((copy md-copy)) "Unsupported by default, to be implemented in each md-copy subclass." (error "Parsing the header of a ~s is not implemented yet." (type-of copy))) @@ -59,12 +59,8 @@ ;; about skipping the first line (loop :repeat (skip-lines copy) :do (read-line input nil nil)) - ;; we might now have to read the fields from the header line - (when (header copy) - (setf (fields copy) - (parse-header copy (read-line input nil nil))) - - (log-message :debug "Parsed header columns ~s" (fields copy))) + ;; we might now have to skip the header line + (when (header copy) (read-line input nil nil)) ;; read in the text file, split it into columns (process-rows copy input process-row-fn)))) diff --git a/src/sources/csv/csv.lisp b/src/sources/csv/csv.lisp index f58007c..4fc6495 100644 --- a/src/sources/csv/csv.lisp +++ b/src/sources/csv/csv.lisp @@ -57,19 +57,29 @@ ;;; ;;; Read a file format in CSV format, and call given function on each line. ;;; -(defmethod parse-header ((csv copy-csv) header) +(defmethod parse-header ((csv copy-csv)) "Parse the header line given csv setup." 
 ;; a field entry is a list of field name and options - (mapcar #'list - (car ; parsing a single line - (cl-csv:read-csv header - :separator (csv-separator csv) - :quote (csv-quote csv) - :escape (csv-escape csv) - :unquoted-empty-string-is-nil t - :quoted-empty-string-is-nil nil - :trim-outer-whitespace (csv-trim-blanks csv) - :newline (csv-newline csv))))) + (with-connection (cnx (source csv) + :direction :input + :external-format (encoding csv) + :if-does-not-exist nil) + (let ((input (md-strm cnx))) + (loop :repeat (skip-lines csv) :do (read-line input nil nil)) + (let* ((header-line (read-line input nil nil)) + (field-name-list + (mapcar #'list ; we need each field to be a list + (car ; parsing a single line + (cl-csv:read-csv header-line + :separator (csv-separator csv) + :quote (csv-quote csv) + :escape (csv-escape csv) + :unquoted-empty-string-is-nil t + :quoted-empty-string-is-nil nil + :trim-outer-whitespace (csv-trim-blanks csv) + :newline (csv-newline csv)))))) + (log-message :notice "Parsed header columns ~s" field-name-list) + (setf (fields csv) field-name-list ))))) (defmethod process-rows ((csv copy-csv) stream process-fn) "Process rows from STREAM according to COPY specifications and PROCESS-FN." diff --git a/test/csv-header.load b/test/csv-header.load index a8b32eb..45b75de 100644 --- a/test/csv-header.load +++ b/test/csv-header.load @@ -15,11 +15,11 @@ LOAD CSV "repl$grpid" text, "repl$id" text, another text, - fields text + fields integer ) $$; -somefields,rekplcode,repl$grpid,repl$id,another,fields -a,b,c,d,e,f -foo,bar,baz,quux,foobar,fizzbuzz +somefields,rekplcode,repl$grpid,repl$id,fields,another +a,b,c,d,1,e +foo,bar,baz,quux,2,foobar diff --git a/test/regress/expected/csv-header.out b/test/regress/expected/csv-header.out index 512042d..71c7ec2 100644 --- a/test/regress/expected/csv-header.out +++ b/test/regress/expected/csv-header.out @@ -1,2 +1,2 @@ -a b c d e f -foo bar baz quux foobar fizzbuzz +a b c d e 1 +foo bar baz quux foobar 2