diff --git a/pgloader.1 b/pgloader.1 index 0ac7c0b..3c6d074 100644 --- a/pgloader.1 +++ b/pgloader.1 @@ -1,7 +1,7 @@ .\" generated with Ronn/v0.7.3 .\" http://github.com/rtomayko/ronn/tree/0.7.3 . -.TH "PGLOADER" "1" "April 2015" "ff" "" +.TH "PGLOADER" "1" "July 2015" "ff" "" . .SH "NAME" \fBpgloader\fR \- PostgreSQL data loader @@ -679,7 +679,7 @@ When omitted, the \fIdbname\fR defaults to the value of the environment variable The optional parameters must be supplied with the form \fBname=value\fR, and you may use several parameters by separating them away using an ampersand (\fB&\fR) character\. . .IP -Only two options are supported here, \fItablename\fR (which might be qualified with a schema name) and \fIsslmode\fR\. +Only some options are supported here, \fItablename\fR (which might be qualified with a schema name), \fIsslmode\fR, \fIhost\fR, \fIport\fR, \fIdbname\fR, \fIuser\fR and \fIpassword\fR\. . .IP The \fIsslmode\fR parameter values can be one of \fBdisable\fR, \fBallow\fR, \fBprefer\fR or \fBrequire\fR\. @@ -687,6 +687,9 @@ The \fIsslmode\fR parameter values can be one of \fBdisable\fR, \fBallow\fR, \fB .IP For backward compatibility reasons, it\'s possible to specify the \fItablename\fR option directly, without spelling out the \fBtablename=\fR parts\. . +.IP +The options override the main URI components when both are given, and using the percent\-encoded option parameters allows using passwords starting with a colon and bypassing other URI components parsing limitations\. +. .IP "" 0 . .SS "Regular Expressions" @@ -967,6 +970,15 @@ When loading from a \fBCSV\fR file, the following options are supported: When this option is listed, pgloader issues a \fBTRUNCATE\fR command against the PostgreSQL target table before reading the data file\. . .IP "\(bu" 4 +\fIdrop indexes\fR +. 
+.IP +When this option is listed, pgloader issues \fBDROP INDEX\fR commands against all the indexes defined on the target table before copying the data, then \fBCREATE INDEX\fR commands once the \fBCOPY\fR is done\. +. +.IP +In order to get the best performances possible, all the indexes are created in parallel and when done the primary keys are built again from the unique indexes just created\. This two step process allows creating the primary key index in parallel with the other indexes, as only the \fBALTER TABLE\fR command needs an \fIaccess exclusive lock\fR on the target table\. +. +.IP "\(bu" 4 \fIdisable triggers\fR . .IP @@ -982,6 +994,12 @@ This option allows loading data into a pre\-existing table ignoring the \fIforei Takes a numeric value as argument\. Instruct pgloader to skip that many lines at the beginning of the input file\. . .IP "\(bu" 4 +\fIcsv header\fR +. +.IP +Use the first line read after \fIskip header\fR as the list of csv field names to be found in the CSV file, using the same CSV parameters as for the CSV data\. +. +.IP "\(bu" 4 \fItrim unquoted blanks\fR . .IP @@ -1012,7 +1030,13 @@ By default, pgloader will use the double\-quote character as the enclosing chara \fIfields escaped by\fR . .IP -Takes either the special value \fIbackslash\-quote\fR or \fIdouble\-quote\fR\. This value is used to recognize escaped field separators when they are to be found within the data fields themselves\. Defaults to \fIdouble\-quote\fR\. +Takes either the special value \fIbackslash\-quote\fR or \fIdouble\-quote\fR, or any value supported by the \fIfields terminated by\fR option (see below)\. This value is used to recognize escaped field separators when they are to be found within the data fields themselves\. Defaults to \fIdouble\-quote\fR\. +. +.IP "\(bu" 4 +\fIcsv escape mode\fR +. 
+.IP +Takes either the special value \fIquote\fR (the default) or \fIfollowing\fR and allows the CSV parser to parse either only escaped field separator or any character (including CSV data) when using the \fIfollowing\fR value\. . .IP "\(bu" 4 \fIfields terminated by\fR . @@ -1684,7 +1708,10 @@ When loading from a \fBMySQL\fR database, the following options are supported, a \fIinclude drop\fR . .IP -When this option is listed, pgloader drop in the PostgreSQL connection all the table whose names have been found in the MySQL database\. This option allows for using the same command several times in a row until you figure out all the options, starting automatically from a clean environment\. +When this option is listed, pgloader drops all the tables in the target PostgreSQL database whose names appear in the MySQL database\. This option allows for using the same command several times in a row until you figure out all the options, starting automatically from a clean environment\. Please note that \fBCASCADE\fR is used to ensure that tables are dropped even if there are foreign keys pointing to them\. This is precisely what \fBinclude drop\fR is intended to do: drop all target tables and recreate them\. +. +.IP +Great care needs to be taken when using \fBinclude drop\fR, as it will cascade to \fIall\fR objects referencing the target tables, possibly including other tables that are not being loaded from the source DB\. . .IP "\(bu" 4 \fIinclude no drop\fR . .IP @@ -2253,7 +2280,10 @@ When loading from a \fBSQLite\fR database, the following options are supported, a \fIinclude drop\fR . .IP -When this option is listed, pgloader drop in the PostgreSQL connection all the table whose names have been found in the SQLite database\. This option allows for using the same command several times in a row until you figure out all the options, starting automatically from a clean environment\. 
+When this option is listed, pgloader drops all the tables in the target PostgreSQL database whose names appear in the SQLite database\. This option allows for using the same command several times in a row until you figure out all the options, starting automatically from a clean environment\. Please note that \fBCASCADE\fR is used to ensure that tables are dropped even if there are foreign keys pointing to them\. This is precisely what \fBinclude drop\fR is intended to do: drop all target tables and recreate them\. +. +.IP +Great care needs to be taken when using \fBinclude drop\fR, as it will cascade to \fIall\fR objects referencing the target tables, possibly including other tables that are not being loaded from the source DB\. . .IP "\(bu" 4 \fIinclude no drop\fR diff --git a/pgloader.1.md b/pgloader.1.md index 4d82c57..e676967 100644 --- a/pgloader.1.md +++ b/pgloader.1.md @@ -830,6 +830,19 @@ The `csv` format command accepts the following clauses and options: When this option is listed, pgloader issues a `TRUNCATE` command against the PostgreSQL target table before reading the data file. + - *drop indexes* + + When this option is listed, pgloader issues `DROP INDEX` commands + against all the indexes defined on the target table before copying + the data, then `CREATE INDEX` commands once the `COPY` is done. + + In order to get the best performances possible, all the indexes are + created in parallel and when done the primary keys are built again + from the unique indexes just created. This two step process allows + creating the primary key index in parallel with the other indexes, + as only the `ALTER TABLE` command needs an *access exclusive lock* + on the target table. + - *disable triggers* When this option is listed, pgloader issues an `ALTER TABLE ... 
diff --git a/src/package.lisp b/src/package.lisp index 9ee1857..dad8b6a 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -200,6 +200,7 @@ #:format-pgsql-create-index #:create-indexes-in-kernel #:set-table-oids + #:drop-indexes #:reset-sequences)) (defpackage #:pgloader.queue @@ -246,7 +247,11 @@ #:pgloader.params #:pgloader.utils #:pgloader.connection #:pgloader.sources #:pgloader.queue) (:import-from #:pgloader.pgsql - #:list-indexes) + #:list-indexes + #:drop-indexes + #:with-pgsql-connection + #:pgsql-execute-with-timing + #:create-indexes-in-kernel) (:export #:*csv-path-root* #:csv-connection #:specs diff --git a/src/parsers/command-csv.lisp b/src/parsers/command-csv.lisp index 69214c2..9247074 100644 --- a/src/parsers/command-csv.lisp +++ b/src/parsers/command-csv.lisp @@ -108,6 +108,7 @@ option-batch-concurrency option-truncate option-disable-triggers + option-drop-indexes option-skip-header option-csv-header option-lines-terminated-by @@ -443,7 +444,8 @@ (let* ((state-before (pgloader.utils:make-pgstate)) (summary (null *state*)) (*state* (or *state* (pgloader.utils:make-pgstate))) - (state-after ,(when after `(pgloader.utils:make-pgstate))) + (state-after ,(when (or after (getf options :drop-indexes)) + `(pgloader.utils:make-pgstate))) ,@(pgsql-connection-bindings pg-db-conn gucs) ,@(batch-control-bindings options) (source-db (with-stats-collection ("fetch" :state state-before) @@ -454,6 +456,7 @@ (let ((truncate (getf ',options :truncate)) (disable-triggers (getf ',options :disable-triggers)) + (drop-indexes (getf ',options :drop-indexes)) (source (make-instance 'pgloader.csv:copy-csv :target-db ,pg-db-conn @@ -464,9 +467,13 @@ :columns ',columns ,@(remove-batch-control-option options :extras '(:truncate + :drop-indexes :disable-triggers))))) (pgloader.sources:copy-from source + :state-before state-before + :state-after state-after :truncate truncate + :drop-indexes drop-indexes :disable-triggers disable-triggers)) ,(sql-code-block pg-db-conn 
'state-after after "after load") diff --git a/src/parsers/command-options.lisp b/src/parsers/command-options.lisp index 28877d5..580e9d6 100644 --- a/src/parsers/command-options.lisp +++ b/src/parsers/command-options.lisp @@ -107,6 +107,7 @@ (make-option-rule include-drop (and kw-include (? kw-no) kw-drop)) (make-option-rule truncate (and (? kw-no) kw-truncate)) (make-option-rule disable-triggers (and kw-disable (? kw-no) kw-triggers)) +(make-option-rule drop-indexes (and kw-drop (? kw-no) kw-indexes)) (make-option-rule create-tables (and kw-create (? kw-no) kw-tables)) (make-option-rule create-indexes (and kw-create (? kw-no) kw-indexes)) (make-option-rule reset-sequences (and kw-reset (? kw-no) kw-sequences)) diff --git a/src/pgsql/queries.lisp b/src/pgsql/queries.lisp index 2dd85e4..ea932d2 100644 --- a/src/pgsql/queries.lisp +++ b/src/pgsql/queries.lisp @@ -189,11 +189,11 @@ where c.oid = '~:[~*~a~;~a.~a~]'::regclass and attnum > 0 order by attnum" schema schema table-name) :column))) -(defun list-indexes (pgconn table-name) - "List all indexes for TABLE-NAME in SCHEMA." - (with-pgsql-connection (pgconn) - (loop :for (index-name table-name table-oid primary sql) - :in (pomo:query (format nil " +(defun list-indexes (table-name) + "List all indexes for TABLE-NAME in SCHEMA. A PostgreSQL connection must + be already established when calling that function." 
+ (loop :for (index-name table-name table-oid primary sql) + :in (pomo:query (format nil " select i.relname, indrelid::regclass, indrelid, @@ -202,17 +202,17 @@ select i.relname, from pg_index x join pg_class i ON i.oid = x.indexrelid where indrelid = '~@[~a.~]~a'::regclass" - (when (typep table-name 'cons) - (car table-name)) - (typecase table-name - (cons (cdr table-name)) - (string table-name)))) - :collect (make-pgsql-index :name index-name - :table-name table-name - :table-oid table-oid - :primary primary - :columns nil - :sql sql)))) + (when (typep table-name 'cons) + (car table-name)) + (typecase table-name + (cons (cdr table-name)) + (string table-name)))) + :collect (make-pgsql-index :name index-name + :table-name table-name + :table-oid table-oid + :primary primary + :columns nil + :sql sql))) (defun list-reserved-keywords (pgconn) "Connect to PostgreSQL DBNAME and fetch reserved keywords." diff --git a/src/pgsql/schema.lisp b/src/pgsql/schema.lisp index faffe22..4cf22dc 100644 --- a/src/pgsql/schema.lisp +++ b/src/pgsql/schema.lisp @@ -298,7 +298,7 @@ index-name table-name cols)) (format nil "ALTER TABLE ~a ADD PRIMARY KEY USING INDEX ~a;" - table-name index-name))) + table-name (pgsql-index-name index)))) (t (or (pgsql-index-sql index) @@ -308,6 +308,17 @@ table-name cols)))))) +(defmethod format-pgsql-drop-index ((index pgsql-index)) + "Generate the PostgreSQL statement to DROP the index." + (let* ((table-name (apply-identifier-case (pgsql-index-table-name index))) + (index-name (apply-identifier-case (pgsql-index-name index)))) + (cond ((pgsql-index-primary index) + (format nil "ALTER TABLE ~a DROP CONSTRAINT ~a;" + table-name index-name)) + + (t + (format nil "DROP INDEX ~a;" index-name))))) + ;;; ;;; Parallel index building. 
;;; @@ -358,6 +369,16 @@ do (loop for index in indexes do (setf (pgsql-index-table-oid index) table-oid))))) +;;; +;;; Drop indexes before loading +;;; +(defun drop-indexes (state pgsql-index-list) + "Drop indexes in PGSQL-INDEX-LIST. A PostgreSQL connection must already be + active when calling that function." + (loop :for index :in pgsql-index-list + :do (let ((sql (format-pgsql-drop-index index))) + (log-message :notice "~a" sql) + (pgsql-execute-with-timing "drop indexes" sql state)))) ;;; diff --git a/src/sources/csv/csv.lisp b/src/sources/csv/csv.lisp index a49599f..9779528 100644 --- a/src/sources/csv/csv.lisp +++ b/src/sources/csv/csv.lisp @@ -173,21 +173,36 @@ "Copy data from given CSV definition into lparallel.queue DATAQ" (map-push-queue csv queue)) -(defmethod copy-from ((csv copy-csv) &key truncate disable-triggers) +(defmethod copy-from ((csv copy-csv) + &key + state-before + state-after + truncate + disable-triggers + drop-indexes) "Copy data from given CSV file definition into its PostgreSQL target table." (let* ((summary (null *state*)) (*state* (or *state* (pgloader.utils:make-pgstate))) (lp:*kernel* (make-kernel 2)) (channel (lp:make-channel)) (queue (lq:make-queue :fixed-capacity *concurrent-batches*)) - (indexes (list-indexes (target-db csv) - (target csv)))) + indexes) ;; issue a performance warning against pre-existing indexes - (when indexes - (log-message :warning "Target table ~s has ~d indexes defined against it." - (target csv) (length indexes)) - (log-message :warning "That could impact loading performance badly")) + (with-pgsql-connection ((target-db csv)) + (setf indexes (list-indexes (target csv))) + (cond ((and indexes (not drop-indexes)) + (log-message :warning + "Target table ~s has ~d indexes defined against it." 
+ (target csv) (length indexes)) + (log-message :warning "That could impact loading performance badly")) + + (indexes + + ;; drop the indexes now + (with-stats-collection ("drop indexes" :state state-before + :summary summary) + (drop-indexes state-before indexes))))) (with-stats-collection ((target csv) :dbname (db-name (target-db csv)) @@ -211,4 +226,29 @@ ;; now wait until both the tasks are over (loop for tasks below 2 do (lp:receive-result channel) - finally (lp:end-kernel)))))) + finally (lp:end-kernel)))) + + ;; re-create the indexes + (when (and indexes drop-indexes) + (let* ((idx-kernel (make-kernel (length indexes))) + (idx-channel (let ((lp:*kernel* idx-kernel)) + (lp:make-channel)))) + (let ((pkeys + (create-indexes-in-kernel (target-db csv) + indexes + idx-kernel + idx-channel + :state state-after))) + (with-stats-collection ("Index Build Completion" :state *state*) + (loop :for idx :in indexes :do (lp:receive-result idx-channel))) + + ;; turn unique indexes into pkeys now + (with-pgsql-connection ((target-db csv)) + (with-stats-collection ("Primary Keys" :state state-after) + (loop :for sql :in pkeys + :when sql + :do (progn + (log-message :notice "~a" sql) + (pgsql-execute-with-timing "Primary Keys" + sql + state-after)))))))))) diff --git a/test/partial.load b/test/partial.load index ef52075..06a1a80 100644 --- a/test/partial.load +++ b/test/partial.load @@ -14,7 +14,8 @@ LOAD CSV FROM inline (a, b, c, d, e) INTO postgresql:///pgloader?partial (a, b, c, e) - WITH fields optionally enclosed by '"', + WITH drop indexes, + fields optionally enclosed by '"', fields escaped by double-quote, fields terminated by '%'