From 89aaabd179b2b58bdec052001d13074107163995 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sun, 13 Oct 2013 22:10:07 +0200 Subject: [PATCH] Port tests from pgloader 2.x, implement trailing-sep. --- parse-ini.lisp | 115 +++++++++++++++++----- parser.lisp | 4 +- test/README.md | 4 + test/allcols.load | 32 ++++++ test/csv.load | 49 +++++++-- test/errors.load | 31 ++++++ test/{ => parse}/csv-with-projection.load | 0 test/parse/csv.load | 9 ++ test/{ => parse}/database.load | 0 test/{ => parse}/hans.goeuro.load | 0 test/{ => parse}/messages.load | 0 test/{ => parse}/mix.load | 0 test/{ => parse}/my.load | 0 test/partial.load | 33 +++++++ test/reformat.load | 31 ++++++ test/serial.load | 31 ++++++ test/simple.load | 36 +++++++ test/udc.load | 34 +++++++ test/xzero.load | Bin 0 -> 870 bytes 19 files changed, 377 insertions(+), 32 deletions(-) create mode 100644 test/README.md create mode 100644 test/allcols.load create mode 100644 test/errors.load rename test/{ => parse}/csv-with-projection.load (100%) create mode 100644 test/parse/csv.load rename test/{ => parse}/database.load (100%) rename test/{ => parse}/hans.goeuro.load (100%) rename test/{ => parse}/messages.load (100%) rename test/{ => parse}/mix.load (100%) rename test/{ => parse}/my.load (100%) create mode 100644 test/partial.load create mode 100644 test/reformat.load create mode 100644 test/serial.load create mode 100644 test/simple.load create mode 100644 test/udc.load create mode 100644 test/xzero.load diff --git a/parse-ini.lisp b/parse-ini.lisp index cd9ef19..f86139d 100644 --- a/parse-ini.lisp +++ b/parse-ini.lisp @@ -57,14 +57,14 @@ :template template))) (datestyle (read-value-for-param config section "datestyle" :template template))) - (setf (params-gucs params) (append (when encoding (list (cons "client_encoding" encoding))) (when datestyle (list (cons "datestyle" datestyle))) - (get-gucs config section) - (when template (get-gucs config template)) - (get-gucs config *global-section*))))) + (merge-gucs + (get-gucs config section) + (when template (get-gucs config template)) + (get-gucs config *global-section*)))))) (defun get-gucs (config section) "Get PostgreSQL settings from SECTION." @@ -73,6 +73,14 @@ when (and (< 10 (length option)) (string= "pg_option_" option :end2 10)) collect (cons (subseq option 10) value))) +(defun merge-gucs (&rest gucs) + "Merge several guc lists into a consolidated one. When the same GUC is + found more than once, we keep the one found first." + (remove-duplicates (apply #'append gucs) + :from-end t + :key #'car + :test #'string=)) + (defun user-defined-columns (config section) "Fetch all option that begin with udc_ as user defined columns" (loop for (option . value) in (ini:items config section) @@ -96,7 +104,7 @@ for name in (list-columns dbname table-name) collect (cons name pos)))) -(defun parse-columns-spec (string config section) +(defun parse-columns-spec (string config section &key trailing-sep) "Parse old-style columns specification, such as: * --> nil x, y, a, b, d:6, c:5 --> \"x, y, a, b, d, c\" @@ -107,7 +115,9 @@ (if (string= string "*") (get-pgsql-column-specs config section) (split-columns-specs string)))) - (values (mapcar #'car (sort (copy-list colspecs) #'< :key #'cdr)) + (values (append + (mapcar #'car (sort (copy-list colspecs) #'< :key #'cdr)) + (when trailing-sep '("trailing"))) (mapcar #'car colspecs)))) (defun parse-only-cols (columns only-cols) @@ -181,6 +191,8 @@ ;; now parse fields and columns (let* ((template (params-use-template params)) + (trailing-sep (read-value-for-param config section "trailing_sep" + :template template)) (columns (read-value-for-param config section "columns" :template template)) (user-defined (append @@ -195,11 +207,14 @@ ;; make sense of the old cruft (multiple-value-bind (fields columns) - (parse-columns-spec columns config section) - (setf (params-fields params) fields) - (setf (params-columns params) - (compute-columns columns only-cols copy-columns user-defined - config section)))) + (parse-columns-spec columns config section :trailing-sep trailing-sep) + (setf (params-fields params) fields) + (setf (params-columns params) (compute-columns columns + only-cols + copy-columns + user-defined + config + section)))) params)) (defun get-connection-params (config section) @@ -265,8 +280,12 @@ (skip-lines (when value (format nil "skip header = ~a" value)))))) -(defun write-command-to-string (config section &key with-data-inline) - "Return the new syntax for the command found in SECTION." +(defun write-command-to-string (config section + &key with-data-inline (end-command t)) + "Return the new syntax for the command found in SECTION. + + When WITH-DATA-INLINE is true, instead of using the SECTION's filename + option, use the constant INLINE in the command." (let ((params (parse-section config section))) (when (and (params-filename params) (params-separator params)) @@ -290,9 +309,12 @@ when option collect it)) ;; GUCs - (format s "~% SET ~{~a~^,~&~10T~};" + (format s "~% SET ~{~a~^,~&~10T~}" (loop for (name . setting) in (params-gucs params) - collect (format nil "~a to '~a'" name setting))))))) + collect (format nil "~a to '~a'" name setting))) + + ;; End the command with a semicolon, unless asked not to + (format s "~@[;~]" end-command))))) (defun convert-ini-into-commands (filename) "Read the INI file at FILENAME and convert each section of it to a command @@ -303,9 +325,21 @@ for command = (write-command-to-string config section) when command collect it)))) -(defun convert-ini-into-files (filename target-directory &key with-data-inline) +(defun convert-ini-into-files (filename target-directory + &key + with-data-inline + include-sql-file) "Reads the INI file at FILENAME and creates files names
.load for - each section in the INI file, in TARGET-DIRECTORY." + each section in the INI file, in TARGET-DIRECTORY. + + When WITH-DATA-INLINE is true, read the CSV file listed as the section's + filename and insert its content in the command itself, as inline data. + + When INCLUDE-SQL-FILE is :if-exists, try to find a sibling file to the + data file, with the same name and with the \"sql\" type, and use its + content in a BEFORE LOAD DO clause. + + When INCLUDE-SQL-FILE is t, not finding the SQL file is an error." (let ((config (read-ini-file filename))) ;; first mkdir -p @@ -317,7 +351,8 @@ :name section :type "load") for command = (write-command-to-string config section - :with-data-inline with-data-inline) + :with-data-inline with-data-inline + :end-command nil) when command do (with-open-file (c target :direction :output @@ -325,11 +360,41 @@ :if-does-not-exist :create :external-format :utf-8) (format c "~a" command) - (when with-data-inline - (let* ((params (parse-section config section)) - (datafile - (merge-pathnames (params-filename params) - (directory-namestring filename)))) - (format c "~%~%~%~%~a" - (slurp-file-into-string datafile))))) + + (let* ((params (parse-section config section)) + (datafile + (merge-pathnames (params-filename params) + (directory-namestring filename))) + (sqlfile + (make-pathname :directory (directory-namestring datafile) + :name (pathname-name datafile) + :type "sql")) + (sql-file-exists (probe-file sqlfile)) + (sql-commands (when sql-file-exists + (slurp-file-into-string sqlfile)))) + ;; First + (if include-sql-file + (if sql-file-exists + (progn + (format c "~%~% BEFORE LOAD DO") + (format c "~{~&~3T$$ ~a; $$~^,~};~%" + (remove-if + (lambda (x) + (string= "" + (string-trim '(#\Space + #\Return + #\Linefeed) x))) + (sq:split-sequence #\; sql-commands)))) + (unless (eq sql-file-exists :if-exists) + (error "File not found: ~s" sqlfile))) + ;; don't include sql file + (format c ";~%")) + + (when with-data-inline + (let* ((params (parse-section config section)) + (datafile + (merge-pathnames (params-filename params) + (directory-namestring filename)))) + (format c "~%~%~%~%~a" + (slurp-file-into-string datafile)))))) and collect target))) diff --git a/parser.lisp b/parser.lisp index 49ad869..b89d2f9 100644 --- a/parser.lisp +++ b/parser.lisp @@ -1423,7 +1423,9 @@ Here's a quick description of the format we're parsing here: ;; normal error processing happen (parse 'commands content))))) -(defun run-commands (source) +(defun run-commands (source + &key + ((:client-min-messages *client-min-messages*) *client-min-messages*)) "SOURCE can be a function, which is run, a list, which is compiled as CL code then run, a pathname containing one or more commands that are parsed then run, or a commands string that is then parsed and each command run." diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..08f777c --- /dev/null +++ b/test/README.md @@ -0,0 +1,4 @@ +# pgloader tests + +In the `parser` directory are tests for the parser only, in the current +directory are tests that can be run to import data. diff --git a/test/allcols.load b/test/allcols.load new file mode 100644 index 0000000..2b8569e --- /dev/null +++ b/test/allcols.load @@ -0,0 +1,32 @@ +LOAD CSV + FROM inline (a, b, c) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?allcols + (a, b, c) + + WITH fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by ':' + + SET client_encoding to 'latin1', + work_mem to '14MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ create table if not exists allcols ( + a integer primary key, + b date, + c text + ); + $$; + + + + +1:2008-02-18:first entry +2:2008-02-19:second one +3:2008-02-20:another +4:2008-02-21:still running +5:2008-02-22:well, some more +6:2008-02-23:antepenultima +7:2008-02-24:next to last +8:2008-02-25:hey, it's today! diff --git a/test/csv.load b/test/csv.load index 5d071e8..9ba1651 100644 --- a/test/csv.load +++ b/test/csv.load @@ -1,9 +1,46 @@ LOAD CSV - FROM '/Users/dim/dev/CL/pgloader/galaxya/yagoa/communaute_profil.csv' - INTO postgresql://dim@localhost:54393/yagoa?communaute_profil + FROM inline + ( + x, + y, + a, + b, + c, + d + ) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?csv + ( + a, + b, + d, + c + ) - WITH truncate, - fields not enclosed, - fields terminated by '\t' + WITH truncate, + skip header = 1, + fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by ',' - SET work_mem to '32 MB', maintenance_work_mem to '64 MB'; + SET client_encoding to 'latin1', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ CREATE TABLE csv ( + a bigint, + b bigint, + c char(2), + d text +); $$; + + + + +Stupid useless header with a © sign +"2.6.190.56","2.6.190.63","33996344","33996351","GB","United Kingdom" +"3.0.0.0","4.17.135.31","50331648","68257567","US","United States" +"4.17.135.32","4.17.135.63","68257568","68257599","CA","Canada" +"4.17.135.64","4.17.142.255","68257600","68259583","US","United States" +"4.17.143.0","4.17.143.15","68259584","68259599","CA","Canada" +"4.17.143.16","4.18.32.71","68259600","68296775","US","United States" diff --git a/test/errors.load b/test/errors.load new file mode 100644 index 0000000..33adba8 --- /dev/null +++ b/test/errors.load @@ -0,0 +1,31 @@ +LOAD CSV + FROM inline (a, c, b, trailing) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?errors + (a, b, c) + + WITH fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by '|' + + SET client_encoding to 'latin1', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ create table if not exists errors ( + a integer primary key, + b date, + c text + ); + $$; + + + + +1|some first row text|2006-13-11| +2|some second row text|2006-11-11| +3|some third row text|2006-10-12| +4|\ |2006-16-4| +5|some fifth row text|2006-5-12| +6|some sixth row text|2006-13-10| +7|some null date to play with|| diff --git a/test/csv-with-projection.load b/test/parse/csv-with-projection.load similarity index 100% rename from test/csv-with-projection.load rename to test/parse/csv-with-projection.load diff --git a/test/parse/csv.load b/test/parse/csv.load new file mode 100644 index 0000000..5d071e8 --- /dev/null +++ b/test/parse/csv.load @@ -0,0 +1,9 @@ +LOAD CSV + FROM '/Users/dim/dev/CL/pgloader/galaxya/yagoa/communaute_profil.csv' + INTO postgresql://dim@localhost:54393/yagoa?communaute_profil + + WITH truncate, + fields not enclosed, + fields terminated by '\t' + + SET work_mem to '32 MB', maintenance_work_mem to '64 MB'; diff --git a/test/database.load b/test/parse/database.load similarity index 100% rename from test/database.load rename to test/parse/database.load diff --git a/test/hans.goeuro.load b/test/parse/hans.goeuro.load similarity index 100% rename from test/hans.goeuro.load rename to test/parse/hans.goeuro.load diff --git a/test/messages.load b/test/parse/messages.load similarity index 100% rename from test/messages.load rename to test/parse/messages.load diff --git a/test/mix.load b/test/parse/mix.load similarity index 100% rename from test/mix.load rename to test/parse/mix.load diff --git a/test/my.load b/test/parse/my.load similarity index 100% rename from test/my.load rename to test/parse/my.load diff --git a/test/partial.load b/test/partial.load new file mode 100644 index 0000000..9da1fad --- /dev/null +++ b/test/partial.load @@ -0,0 +1,33 @@ +LOAD CSV + FROM inline (a, b, c, d, e) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?partial + (a, b, c, e) + + WITH fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by '%' + + SET client_encoding to 'latin1', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ create table if not exists partial ( + a integer primary key, + b text, + c text, + d text, + e text + ); + $$; + + + + +1%foo%bar%baz%hop +2%foo%bar%baz%hop +3%foo%bar%baz%hop +4%foo%bar%baz%hop +5%foo%bar%baz%hop +6%foo%bar%baz%hop +7%foo%bar%baz%hop diff --git a/test/reformat.load b/test/reformat.load new file mode 100644 index 0000000..4f6daa3 --- /dev/null +++ b/test/reformat.load @@ -0,0 +1,31 @@ +LOAD CSV + FROM inline (id, timestamp) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?reformat + ( + id, + timestamp timestamptz using (date-with-no-separator timestamp) + ) + + WITH fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by '|' + + SET client_encoding to 'latin1', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ create table if not exists reformat ( + id integer primary key, + timestamp timestamp with time zone + ); + $$; + + + + +1|20071119150718 +2|20041002153048 +3|20060111060850 +4|20060111060958 +5|00000000000000 diff --git a/test/serial.load b/test/serial.load new file mode 100644 index 0000000..92961e2 --- /dev/null +++ b/test/serial.load @@ -0,0 +1,31 @@ +LOAD CSV + FROM inline (c, b) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?serial + (b, c) + + WITH fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by ';' + + SET client_encoding to 'latin1', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ create table if not exists serial ( + a serial primary key, + b date, + c text + ); + $$; + + + + +some first row text;2006-11-11 +some second row text;2006-11-11 +some third row text;2006-10-12 +\ ;2006-10-4 +some fifth row text;2006-5-12 +some sixth row text;2006-7-10 +some null date to play with; diff --git a/test/simple.load b/test/simple.load new file mode 100644 index 0000000..06b8cc5 --- /dev/null +++ b/test/simple.load @@ -0,0 +1,36 @@ +LOAD CSV + FROM inline (a, c, b, trailing) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?simple + (a, b, c) + + WITH truncate, + skip header = 2, + fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by '|' + + SET client_encoding to 'latin1', + datestyle to 'dmy', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ CREATE TABLE if not exists simple ( + a integer primary key, + b date, + c text + ); + $$; + + + + +This is a stupid useless header like you sometime find in CSV files +id|data|date| +1|some first row text|2006-11-11| +2|some second row text|13/11/2006| +3|some third row text|12-10-2006| +4|\ |2006-10-4| +5|some fifth row text|2006-5-12| +6|some sixth row text|10/7/6| +7|some null date to play with|| diff --git a/test/udc.load b/test/udc.load new file mode 100644 index 0000000..7db5b27 --- /dev/null +++ b/test/udc.load @@ -0,0 +1,34 @@ +LOAD CSV + FROM inline WITH ENCODING latin1 + (d, b, x, y) + INTO postgresql://dim:pgpass@localhost:54393/pgloader?udc + ( + b, + c text using "constant value", + d + ) + + WITH fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by '%' + + SET client_encoding to 'latin1', + work_mem to '12MB', + standard_conforming_strings to 'on' + + BEFORE LOAD DO + $$ create table if not exists udc ( + b integer primary key, + c text, + d integer + ); + $$; + + + + +1%5%foo%bar +2%10%bar%toto +3%4%toto%titi +4%18%titi%baz +5%2%baz%foo diff --git a/test/xzero.load b/test/xzero.load new file mode 100644 index 0000000000000000000000000000000000000000..d5d110528ff1633fe9ad585ad84790ccee0c094c GIT binary patch literal 870 zcmZvaZEM>w6ovQMzv40qX&}aNURstiwslQupHx0mwZ!3{nq|fXLc2Zu`GlIKN(o6E5V_* zDAC$O@1)owG`a?>OC(XEZ=d4cJp6I~*|U-0&fBqYK^LVipk6YT!06Z3+l_|9i#SdF zbkV3IEc%G}8