From e1d2bd13181f475e609ec73ef8f90da1c1f6ab9b Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sun, 13 Oct 2013 22:45:29 +0200 Subject: [PATCH] Improve LOAD DBF command (support for http and zip). --- db3.lisp | 63 +++++++++++++++++++++++++---------------------- parser.lisp | 44 ++++++++++++++++++++++++--------- pgloader.1.md | 25 +++++++++++++------ test/dbf-zip.load | 5 ++++ test/dbf.load | 7 +++--- 5 files changed, 94 insertions(+), 50 deletions(-) create mode 100644 test/dbf-zip.load diff --git a/db3.lisp b/db3.lisp index 1c17929..dc835a9 100644 --- a/db3.lisp +++ b/db3.lisp @@ -117,42 +117,47 @@ (defun stream-file (filename &key dbname + state-before (table-name (pathname-name filename)) create-table truncate) "Open the DB3 and stream its content to a PostgreSQL database." - (with-pgsql-transaction (dbname) - (when create-table - (let ((create-table-sql (db3-create-table filename))) - (log-message :notice "Create table \"~a\"" table-name) - (log-message :info "~a" create-table-sql) - (pgsql-execute create-table-sql))) + (let* ((summary (null *state*)) + (*state* (or *state* (make-pgstate)))) - (when (and truncate (not create-table)) - ;; we don't TRUNCATE a table we just CREATEd - (let ((truncate-sql (format nil "TRUNCATE ~a;" table-name))) - (log-message :notice "~a" truncate-sql) - (pgsql-execute truncate-sql)))) + (with-stats-collection (dbname "create, truncate" + :state state-before + :summary summary) + (with-pgsql-transaction (dbname) + (when create-table + (let ((create-table-sql (db3-create-table filename))) + (log-message :notice "Create table \"~a\"" table-name) + (log-message :info "~a" create-table-sql) + (pgsql-execute create-table-sql))) - (let* ((*state* (make-pgstate)) - (lp:*kernel* (make-kernel 2)) - (channel (lp:make-channel)) - (dataq (lq:make-queue :fixed-capacity 4096))) + (when (and truncate (not create-table)) + ;; we don't TRUNCATE a table we just CREATEd + (let ((truncate-sql (format nil "TRUNCATE ~a;" table-name))) + 
(log-message :notice "~a" truncate-sql) + (pgsql-execute truncate-sql))))) - (with-stats-collection (dbname table-name :state *state* :summary t) - (log-message :notice "COPY \"~a\" from '~a'" table-name filename) - (lp:submit-task channel #'copy-to-queue filename dataq table-name) + (let* ((lp:*kernel* (make-kernel 2)) + (channel (lp:make-channel)) + (dataq (lq:make-queue :fixed-capacity 4096))) - ;; and start another task to push that data from the queue to PostgreSQL - (lp:submit-task channel - #'pgloader.pgsql:copy-from-queue - dbname table-name dataq - :truncate truncate - :transforms (transforms filename)) + (with-stats-collection (dbname table-name :state *state* :summary summary) + (log-message :notice "COPY \"~a\" from '~a'" table-name filename) + (lp:submit-task channel #'copy-to-queue filename dataq table-name) - ;; now wait until both the tasks are over, and kill the kernel - (loop for tasks below 2 do (lp:receive-result channel) - finally - (log-message :info "COPY \"~a\" done." table-name) - (lp:end-kernel))))) + ;; and start another task to push that data from the queue to PostgreSQL + (lp:submit-task channel + #'pgloader.pgsql:copy-from-queue + dbname table-name dataq + :truncate truncate + :transforms (transforms filename)) + ;; now wait until both the tasks are over, and kill the kernel + (loop for tasks below 2 do (lp:receive-result channel) + finally + (log-message :info "COPY \"~a\" done." 
table-name) + (lp:end-kernel)))))) diff --git a/parser.lisp b/parser.lisp index b89d2f9..28885ca 100644 --- a/parser.lisp +++ b/parser.lisp @@ -831,31 +831,53 @@ Here's a quick description of the format we're parsing here: (declare (ignore w)) opts))) -(defrule dbf-source (and kw-load kw-dbf kw-from maybe-quoted-filename) +(defrule dbf-source (and kw-load kw-dbf kw-from filename-or-http-uri) (:lambda (src) (destructuring-bind (load dbf from source) src (declare (ignore load dbf from)) - ;; source is (:filename #P"pathname/here") - (destructuring-bind (type uri) source - (ecase type - (:filename uri)))))) + source))) -(defrule load-dbf-file (and dbf-source target dbf-options) +(defrule load-dbf-file (and dbf-source target dbf-options (? gucs)) (:lambda (command) - (destructuring-bind (source pg-db-uri options) command + (destructuring-bind (source pg-db-uri options gucs) command (destructuring-bind (&key host port user password dbname table-name &allow-other-keys) pg-db-uri `(lambda () - (let* ((*pgconn-host* ,host) + (let* ((state-before (pgloader.utils:make-pgstate)) + (*state* (pgloader.utils:make-pgstate)) + (source + ,(destructuring-bind (kind url) source + (ecase kind + (:http `(with-stats-collection + (,dbname "download" :state state-before) + (pgloader.archive:http-fetch-file ,url))) + (:filename url)))) + (source + (if (string= "zip" (pathname-type source)) + (progn + (with-stats-collection (,dbname "extract" + :state state-before) + (let ((d (pgloader.archive:expand-archive source))) + (merge-pathnames + (make-pathname :name (pathname-name source) + :type "dbf") + d)))) + source)) + (*pgconn-host* ,host) (*pgconn-port* ,port) (*pgconn-user* ,user) - (*pgconn-pass* ,password)) - (pgloader.db3:stream-file ,source + (*pgconn-pass* ,password) + (*pg-settings* ',gucs)) + (pgloader.db3:stream-file source + :state-before state-before :dbname ,dbname ,@(when table-name (list :table-name table-name)) - ,@options))))))) + ,@options) + + (report-full-summary *state* 
state-before nil + "Total import time"))))))) #| diff --git a/pgloader.1.md b/pgloader.1.md index f96e5a1..fc6ccd3 100644 --- a/pgloader.1.md +++ b/pgloader.1.md @@ -266,20 +266,22 @@ This command instructs pgloader to load data from a `DBF` file. Here's an example: LOAD DBF - FROM '/Users/dim/Downloads/comsimp2013.dbf' - INTO postgresql://dim@localhost:54393/dim?comsimp2013 - WITH truncate, create table, table name = 'comsimp2013'; + FROM http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2013/dbf/reg2013.dbf + INTO postgresql://dim@localhost:54393/dim + WITH truncate, create table; -The `csv` format command accepts the following clauses and options: +The `dbf` format command accepts the following clauses and options: - *FROM* - Filename where to load the data from. + Filename where to load the data from. This supports local files, HTTP + URLs and zip files containing a single dbf file of the same name. Fetching + such a zip file from an HTTP address is of course supported. - *INTO* - The PostgreSQL connection URI must contains the possibly qualified name - of the target table where to load the data into. + The PostgreSQL connection URI. If it doesn't have a table name in the + target, then the name part of the filename will be used as a table name. - *WITH* @@ -302,6 +304,15 @@ The `csv` format command accepts the following clauses and options: This options expects as its value the possibly qualified name of the table to create. + - *SET* + + This clause allows specifying session parameters to be set for all the + sessions opened by pgloader. It expects a list of parameter names, each with the + equal sign, then the single-quoted value, as a comma-separated list. + + The names and values of the parameters are not validated by pgloader; + they are given as-is to PostgreSQL. 
+ ## LOAD ARCHIVE This command instructs pgloader to load data from one or more files contained diff --git a/test/dbf-zip.load b/test/dbf-zip.load new file mode 100644 index 0000000..cfc2961 --- /dev/null +++ b/test/dbf-zip.load @@ -0,0 +1,5 @@ +LOAD DBF + FROM http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2013/dbf/historiq2013.zip + INTO postgresql://dim@localhost:54393/dim + WITH truncate, create table + SET client_encoding TO 'latin1'; diff --git a/test/dbf.load b/test/dbf.load index 33f854e..4a21d7a 100644 --- a/test/dbf.load +++ b/test/dbf.load @@ -1,4 +1,5 @@ LOAD DBF - FROM '/Users/dim/Downloads/comsimp2013.dbf' - INTO postgresql://dim@localhost:54393/dim?comsimp2013 - WITH truncate, create table, table name = 'comsimp2013'; + FROM http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2013/dbf/reg2013.dbf + INTO postgresql://dim@localhost:54393/dim + WITH truncate, create table + SET client_encoding TO 'latin1';