From 46d14af0d38b26b066876ecf45a2569da31f1d7e Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sun, 8 Jul 2018 20:34:55 +0200 Subject: [PATCH 01/69] Add more default rules to MySQL datetime handling. Given the variety of ways to setup default behavior for datetime and timestamp data types in MySQL, we need yet more default casting rules. It might be time to think about a more principled way to solve the problem, but on the other hand, this ad-hoc one also comes with full overriding flexibility for the end user. Fixes #811. --- src/sources/mysql/mysql-cast-rules.lisp | 8 ++++++++ test/mysql/my.sql | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/sources/mysql/mysql-cast-rules.lisp b/src/sources/mysql/mysql-cast-rules.lisp index e53daa3..f05ecde 100644 --- a/src/sources/mysql/mysql-cast-rules.lisp +++ b/src/sources/mysql/mysql-cast-rules.lisp @@ -119,6 +119,10 @@ :target (:type "timestamptz" :drop-default t :drop-not-null t) :using pgloader.transforms::zero-dates-to-null) + (:source (:type "datetime" :on-update-current-timestamp t :not-null nil) + :target (:type "timestamptz" :drop-default t) + :using pgloader.transforms::zero-dates-to-null) + (:source (:type "timestamp" :default "0000-00-00 00:00:00" :not-null t) :target (:type "timestamptz" :drop-default t :drop-not-null t) :using pgloader.transforms::zero-dates-to-null) @@ -131,6 +135,10 @@ :target (:type "timestamptz" :drop-default t :drop-not-null t) :using pgloader.transforms::zero-dates-to-null) + (:source (:type "timestamp" :on-update-current-timestamp t :not-null nil) + :target (:type "timestamptz" :drop-default t) + :using pgloader.transforms::zero-dates-to-null) + (:source (:type "date" :default "0000-00-00") :target (:type "date" :drop-default t) :using pgloader.transforms::zero-dates-to-null) diff --git a/test/mysql/my.sql b/test/mysql/my.sql index 17cd3b6..a9daefd 100644 --- a/test/mysql/my.sql +++ b/test/mysql/my.sql @@ -100,6 +100,24 @@ create table bits insert into 
bits(bool) values(0b00), (0b01); +/* + * https://github.com/dimitri/pgloader/issues/811 + */ +CREATE TABLE `domain_filter` ( + `id` binary(16) NOT NULL , + `type` varchar(50) NOT NULL , + `value` json DEFAULT NULL , + `negated` tinyint(1) NOT NULL DEFAULT '0' , + `report_id` varbinary(255) NOT NULL , + `query_id` varchar(255) NOT NULL , + `created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP , + `updated_at` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP , + `updated_by` varbinary(255) DEFAULT NULL , + PRIMARY KEY (`id`), + UNIQUE KEY `domain_filter_unq` (`report_id`,`query_id`,`type`), + KEY `domain_filter` (`type`) +) ENGINE=InnoDB DEFAULT CHARSET=ascii; + /* * https://github.com/dimitri/pgloader/issues/703 */ From 5ca3ee8aad1d70ce4b3cc5b05bc7bd2f88f32010 Mon Sep 17 00:00:00 2001 From: alexknips Date: Fri, 20 Jul 2018 14:38:06 +0200 Subject: [PATCH 02/69] Fix documentation of default MySQL cast rules (#815) The default rule is `type int to bigint when (>= 10 precision)`. --- docs/ref/mysql.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ref/mysql.rst b/docs/ref/mysql.rst index 1957944..dec33a9 100644 --- a/docs/ref/mysql.rst +++ b/docs/ref/mysql.rst @@ -556,7 +556,7 @@ Numbers:: type int with extra auto_increment to serial when (< precision 10) type int with extra auto_increment to bigserial when (<= 10 precision) type int to int when (< precision 10) - type int to bigint when (<= 10 precision) + type int to bigint when (>= 10 precision) type tinyint with extra auto_increment to serial type smallint with extra auto_increment to serial type mediumint with extra auto_increment to serial From 34cc25383ac3729c9c7ebcd06ab068ee24ae55f4 Mon Sep 17 00:00:00 2001 From: uniquestring <36343026+uniquestring@users.noreply.github.com> Date: Sat, 11 Aug 2018 01:08:00 +0200 Subject: [PATCH 03/69] Improved Dockerfiles/docker image size (#821) * Add dockerfiles to .dockerignore Otherwise changes in the dockerfiles would invalidate the cache * 
Rewrite Dockerfile - Fix deprecated MAINTAINER instruction - Move maintainer label to the bottom (improving cache) - Tidy up apt-get - Use COPY instead of ADD see https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#add-or-copy - Remove WORKDIR instruction (we don't really need this) - Combine remaining RUN layers to reduce layer count - Move final binary instead of copying (reduce image size) * Use -slim image and multistage build Reduce size by using multistage builds and the -slim image. Use debian:stable instead of a specific code name (future proof). * [cosmetic] indent Dockerfile instructions Make it easier to see where a new build stage begins * Rewrite Dockerfile.ccl Apply the same changes to Dockerfile.ccl as we did for Dockerfile --- .dockerignore | 2 ++ Dockerfile | 57 +++++++++++++++++++++++++++++++------------- Dockerfile.ccl | 64 ++++++++++++++++++++++++++++++++++---------------- 3 files changed, 87 insertions(+), 36 deletions(-) diff --git a/.dockerignore b/.dockerignore index d075b3e..6be6907 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,5 @@ .git .vagrant build +Dockerfile +Dockerfile.ccl \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 6fc43dc..0500aa2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,45 @@ -FROM debian:stretch -MAINTAINER Dimitri Fontaine +FROM debian:stable-slim as builder -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - wget curl make git bzip2 time \ - ca-certificates \ - libzip-dev libssl1.1 openssl \ - patch unzip libsqlite3-dev gawk \ - freetds-dev sbcl && \ - rm -rf /var/lib/apt/lists/* + RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bzip2 \ + ca-certificates \ + curl \ + freetds-dev \ + gawk \ + git \ + libsqlite3-dev \ + libssl1.1 \ + libzip-dev \ + make \ + openssl \ + patch \ + sbcl \ + time \ + unzip \ + wget \ + && rm -rf /var/lib/apt/lists/* -ADD ./ /opt/src/pgloader -WORKDIR /opt/src/pgloader + COPY ./
/opt/src/pgloader -# build/ is in the .dockerignore file, but we actually need it now -RUN mkdir -p build/bin -RUN make + RUN mkdir -p /opt/src/pgloader/build/bin \ + && cd /opt/src/pgloader \ + && make -RUN cp /opt/src/pgloader/build/bin/pgloader /usr/local/bin +FROM debian:stable-slim + + RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl \ + freetds-dev \ + gawk \ + libsqlite3-dev \ + libzip-dev \ + make \ + sbcl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + + COPY --from=builder /opt/src/pgloader/build/bin/pgloader /usr/local/bin + + LABEL maintainer="Dimitri Fontaine " \ No newline at end of file diff --git a/Dockerfile.ccl b/Dockerfile.ccl index a33f8c9..f88468a 100644 --- a/Dockerfile.ccl +++ b/Dockerfile.ccl @@ -1,25 +1,49 @@ -FROM debian:stretch -MAINTAINER Dimitri Fontaine +FROM debian:stable-slim as builder -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - wget curl make git bzip2 time \ - ca-certificates \ - libzip-dev libssl1.1 openssl \ - patch unzip libsqlite3-dev gawk \ - freetds-dev sbcl && \ - rm -rf /var/lib/apt/lists/* + RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bzip2 \ + ca-certificates \ + curl \ + freetds-dev \ + gawk \ + git \ + libsqlite3-dev \ + libssl1.1 \ + libzip-dev \ + make \ + openssl \ + patch \ + sbcl \ + time \ + unzip \ + wget \ + && rm -rf /var/lib/apt/lists/* -WORKDIR /usr/local/src -RUN curl --location -O https://github.com/Clozure/ccl/releases/download/v1.11.5/ccl-1.11.5-linuxx86.tar.gz -RUN tar xf ccl-1.11.5-linuxx86.tar.gz -RUN cp /usr/local/src/ccl/scripts/ccl64 /usr/local/bin/ccl + RUN curl -SL https://github.com/Clozure/ccl/releases/download/v1.11.5/ccl-1.11.5-linuxx86.tar.gz \ + | tar xz -C /usr/local/src/ \ + && mv /usr/local/src/ccl/scripts/ccl64 /usr/local/bin/ccl -ADD ./ /opt/src/pgloader -WORKDIR /opt/src/pgloader + COPY ./ /opt/src/pgloader -# build/ is in the .dockerignore file, but we actually need it now -RUN mkdir -p build/bin 
-RUN make CL=ccl DYNSIZE=256 + RUN mkdir -p /opt/src/pgloader/build/bin \ + && cd /opt/src/pgloader \ + && make CL=ccl DYNSIZE=256 -RUN cp /opt/src/pgloader/build/bin/pgloader /usr/local/bin +FROM debian:stable-slim + + RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl \ + freetds-dev \ + gawk \ + libsqlite3-dev \ + libzip-dev \ + make \ + sbcl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + + COPY --from=builder /opt/src/pgloader/build/bin/pgloader /usr/local/bin + + LABEL maintainer="Dimitri Fontaine " \ No newline at end of file From 1ee389d1210f207dcc66ab704db5ccb6de897246 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 14 Aug 2018 10:06:45 +0300 Subject: [PATCH 04/69] Fix parsing empty hostname fields in pgpass. Fixes #823. --- src/parsers/parse-pgpass.lisp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parsers/parse-pgpass.lisp b/src/parsers/parse-pgpass.lisp index 3eaaa7e..82efbee 100644 --- a/src/parsers/parse-pgpass.lisp +++ b/src/parsers/parse-pgpass.lisp @@ -19,11 +19,11 @@ (pgpass-char-p character)))) (:lambda (e) (text e))) -(defrule pgpass-line (and pgpass-entry #\: pgpass-entry #\: +(defrule pgpass-line (and (? pgpass-entry) #\: pgpass-entry #\: pgpass-entry #\: pgpass-entry #\: (? pgpass-entry)) (:lambda (pl) - (make-pgpass :hostname (first pl) + (make-pgpass :hostname (or (first pl) "localhost") :port (third pl) :database (fifth pl) :username (seventh pl) From fc3a1949f74bbfbebdbc023d6e55dc15e5d6df33 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Mon, 20 Aug 2018 11:09:52 +0200 Subject: [PATCH 05/69] Add support for PostgreSQL as a source database. It's now possible to use pgloader to migrate from PostgreSQL to PostgreSQL. That might be useful for several reasons, including applying user defined cast rules at COPY time, or just moving from one hosted solution to another.
--- pgloader.asd | 9 +- src/load/migrate-database.lisp | 6 + src/package.lisp | 29 ++++- src/parsers/command-parser.lisp | 6 +- src/parsers/command-pgsql.lisp | 159 ++++++++++++++++++++++++ src/pgsql/pgsql-create-schema.lisp | 25 ++-- src/pgsql/pgsql-ddl.lisp | 63 +++++++--- src/pgsql/pgsql-schema.lisp | 90 ++++++++++++-- src/pgsql/sql/list-all-columns.sql | 24 +++- src/pgsql/sql/list-all-extensions.sql | 4 + src/pgsql/sql/list-all-sqltypes.sql | 43 +++++++ src/sources/pgsql/pgsql-cast-rules.lisp | 48 +++++++ src/sources/pgsql/pgsql.lisp | 90 ++++++++++++++ src/sources/sqlite/sqlite.lisp | 2 +- src/utils/catalog.lisp | 93 ++++++++++++-- test/archive.load | 3 +- test/pgsql-source.load | 6 + 17 files changed, 639 insertions(+), 61 deletions(-) create mode 100644 src/parsers/command-pgsql.lisp create mode 100644 src/pgsql/sql/list-all-extensions.sql create mode 100644 src/pgsql/sql/list-all-sqltypes.sql create mode 100644 src/sources/pgsql/pgsql-cast-rules.lisp create mode 100644 src/sources/pgsql/pgsql.lisp create mode 100644 test/pgsql-source.load diff --git a/pgloader.asd b/pgloader.asd index 55468e4..89db8c8 100644 --- a/pgloader.asd +++ b/pgloader.asd @@ -182,7 +182,13 @@ ;; :depends-on ("mysql-schema")) (:file "mysql" :depends-on ("mysql-cast-rules" - "mysql-schema")))))) + "mysql-schema")))) + + (:module "pgsql" + :serial t + :depends-on ("common") + :components ((:file "pgsql-cast-rules") + (:file "pgsql"))))) ;; package pgloader.copy (:module "pg-copy" @@ -247,6 +253,7 @@ (:file "command-including-like") (:file "command-mssql") (:file "command-sqlite") + (:file "command-pgsql") (:file "command-archive") (:file "command-parser") (:file "parse-sqlite-type-name") diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index e4c39bf..044d931 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -46,6 +46,12 @@ (with-stats-collection ("Create SQL Types" :section :pre :use-result-as-read t :use-result-as-rows t) + 
;; some SQL types come from extensions (ip4r, hstore, etc) + (create-extensions catalog + :include-drop include-drop + :if-not-exists t + :client-min-messages :error) + (create-sqltypes catalog :include-drop include-drop :client-min-messages :error)) diff --git a/src/package.lisp b/src/package.lisp index 1cde4df..8e81cdf 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -49,8 +49,9 @@ #:catalog #:schema - #:table + #:extension #:sqltype + #:table #:column #:index #:fkey @@ -82,6 +83,8 @@ #:schema-source-name #:schema-table-list #:schema-view-list + #:schema-extension-list + #:schema-sqltype-list #:schema-in-search-path #:table-name @@ -96,11 +99,15 @@ #:table-fkey-list #:table-trigger-list + #:extension-name + #:extension-schema + #:sqltype-name #:sqltype-schema #:sqltype-type #:sqltype-source-def #:sqltype-extra + #:sqltype-extension #:column-name #:column-type-name @@ -110,6 +117,7 @@ #:column-comment #:column-transform #:column-extra + #:column-transform-default #:index-name #:index-type @@ -152,9 +160,15 @@ #:table-list #:view-list + #:extension-list + #:sqltype-list #:add-schema #:find-schema #:maybe-add-schema + #:add-extension + #:find-extension + #:maybe-add-extension + #:add-sqltype #:add-table #:find-table #:maybe-add-table @@ -389,6 +403,7 @@ #:truncate-tables #:set-table-oids + #:create-extensions #:create-sqltypes #:create-schemas #:add-to-search-path @@ -417,6 +432,7 @@ #:process-index-definitions ;; postgresql introspection queries + #:list-all-sqltypes #:list-all-columns #:list-all-indexes #:list-all-fkeys @@ -674,6 +690,14 @@ #:*mysql-default-cast-rules* #:with-mysql-connection)) +(defpackage #:pgloader.source.pgsql + (:use #:cl + #:pgloader.params #:pgloader.utils #:pgloader.connection + #:pgloader.sources #:pgloader.pgsql #:pgloader.catalog) + (:import-from #:pgloader.transforms #:precision #:scale) + (:export #:copy-pgsql + #:*pgsql-default-cast-rules*)) + (defpackage #:pgloader.source.sqlite (:use #:cl #:pgloader.params #:pgloader.utils 
#:pgloader.connection @@ -763,6 +787,9 @@ (:import-from #:pgloader.source.copy #:copy-copy #:copy-connection) + (:import-from #:pgloader.source.pgsql + #:copy-pgsql + #:*pgsql-default-cast-rules*) (:import-from #:pgloader.source.mysql #:copy-mysql #:mysql-connection diff --git a/src/parsers/command-parser.lisp b/src/parsers/command-parser.lisp index 57e244d..1484b0a 100644 --- a/src/parsers/command-parser.lisp +++ b/src/parsers/command-parser.lisp @@ -17,6 +17,7 @@ load-copy-file load-dbf-file load-ixf-file + load-pgsql-database load-mysql-database load-mssql-database load-sqlite-database @@ -160,12 +161,12 @@ (declare (ignore abs paths no-path-p)) (let ((dotted-parts (reverse (sq:split-sequence #\. filename)))) (when (<= 2 (length dotted-parts)) - (destructuring-bind (extension name-or-ext &rest parts) + (destructuring-bind (ext name-or-ext &rest parts) dotted-parts (declare (ignore parts)) (if (string-equal "tar" name-or-ext) :archive (loop :for (type . extensions) :in *data-source-filename-extensions* - :when (member extension extensions :test #'string-equal) + :when (member ext extensions :test #'string-equal) :return type))))))) (defvar *parse-rule-for-source-types* @@ -266,6 +267,7 @@ (:dbf 'dbf-option) (:ixf 'ixf-option) (:sqlite 'sqlite-option) + (:pgsql 'pgsql-option) (:mysql 'mysql-option) (:mssql 'mysql-option)) option)))) diff --git a/src/parsers/command-pgsql.lisp b/src/parsers/command-pgsql.lisp new file mode 100644 index 0000000..2a09fd7 --- /dev/null +++ b/src/parsers/command-pgsql.lisp @@ -0,0 +1,159 @@ +;;; +;;; Parse the pgloader commands grammar +;;; + +(in-package :pgloader.parser) + +;;; +;;; PostgreSQL options +;;; +(defrule pgsql-option (or option-on-error-stop + option-on-error-resume-next + option-workers + option-concurrency + option-batch-rows + option-batch-size + option-prefetch-rows + option-max-parallel-create-index + option-reindex + option-truncate + option-disable-triggers + option-data-only + option-schema-only + 
option-include-drop + option-drop-schema + option-create-tables + option-create-indexes + option-index-names + option-reset-sequences + option-foreign-keys + option-identifiers-case)) + +(defrule pgsql-options (and kw-with + (and pgsql-option (* (and comma pgsql-option)))) + (:function flatten-option-list)) + + +;;; +;;; Including only some tables or excluding some others +;;; +(defrule including-matching-in-schema-filter + (and kw-including kw-only kw-table kw-names kw-matching filter-list-matching + kw-in kw-schema quoted-namestring) + (:lambda (source) + (bind (((_ _ _ _ _ filter-list _ _ schema) source)) + (cons schema filter-list)))) + +(defrule including-matching-in-schema + (and including-in-schema (* including-in-schema)) + (:lambda (source) + (destructuring-bind (inc1 incs) source + (cons :including (list* inc1 incs))))) + +(defrule excluding-matching-in-schema-filter + (and kw-excluding kw-table kw-names kw-matching filter-list-matching + kw-in kw-schema quoted-namestring) + (:lambda (source) + (bind (((_ _ _ _ filter-list _ _ schema) source)) + (cons schema filter-list)))) + +(defrule excluding-matching-in-schema + (and excluding-in-schema (* excluding-in-schema)) + (:lambda (source) + (destructuring-bind (excl1 excls) source + (cons :excluding (list* excl1 excls))))) + + +;;; +;;; Allow clauses to appear in any order +;;; +(defrule load-pgsql-optional-clauses (* (or pgsql-options + gucs + casts + alter-table + alter-schema + materialize-views + including-matching-in-schema + excluding-matching-in-schema + decoding-tables-as + before-load + after-load)) + (:lambda (clauses-list) + (alexandria:alist-plist clauses-list))) + +(defrule pgsql-source (and kw-load kw-database kw-from pgsql-uri) + (:lambda (source) (bind (((_ _ _ uri) source)) uri))) + +(defrule load-pgsql-command (and pgsql-source target + load-pgsql-optional-clauses) + (:lambda (command) + (destructuring-bind (source target clauses) command + `(,source ,target ,@clauses)))) + + +;;; LOAD 
DATABASE FROM pgsql:// +(defun lisp-code-for-pgsql-dry-run (pg-src-db-conn pg-dst-db-conn) + `(lambda () + (log-message :log "DRY RUN, only checking connections.") + (check-connection ,pg-src-db-conn) + (check-connection ,pg-dst-db-conn))) + +(defun lisp-code-for-loading-from-pgsql (pg-src-db-conn pg-dst-db-conn + &key + gucs + casts before after options + alter-table alter-schema + ((:including incl)) + ((:excluding excl)) + ((:decoding decoding-as)) + &allow-other-keys) + `(lambda () + (let* ((*default-cast-rules* ',*pgsql-default-cast-rules*) + (*cast-rules* ',casts) + (*identifier-case* :quote) + (on-error-stop (getf ',options :on-error-stop t)) + ,@(pgsql-connection-bindings pg-dst-db-conn gucs) + ,@(batch-control-bindings options) + (source + (make-instance 'copy-pgsql + :target-db ,pg-dst-db-conn + :source-db ,pg-src-db-conn))) + + ,(sql-code-block pg-dst-db-conn :pre before "before load") + + (copy-database source + :including ',incl + :excluding ',excl + :alter-table ',alter-table + :alter-schema ',alter-schema + :index-names :preserve + :set-table-oids t + :on-error-stop on-error-stop + ,@(remove-batch-control-option options)) + + ,(sql-code-block pg-dst-db-conn :post after "after load")))) + +(defrule load-pgsql-database load-pgsql-command + (:lambda (source) + (destructuring-bind (pg-src-db-uri + pg-dst-db-uri + &key + gucs casts before after options + alter-table alter-schema + including excluding decoding) + source + (cond (*dry-run* + (lisp-code-for-pgsql-dry-run pg-src-db-uri pg-dst-db-uri)) + (t + (lisp-code-for-loading-from-pgsql pg-src-db-uri pg-dst-db-uri + :gucs gucs + :casts casts + :before before + :after after + :options options + :alter-table alter-table + :alter-schema alter-schema + :including including + :excluding excluding + :decoding decoding)))))) + diff --git a/src/pgsql/pgsql-create-schema.lisp b/src/pgsql/pgsql-create-schema.lisp index dd490ac..e6154e5 100644 --- a/src/pgsql/pgsql-create-schema.lisp +++ 
b/src/pgsql/pgsql-create-schema.lisp @@ -13,17 +13,7 @@ include-drop (client-min-messages :notice)) "Create the needed data types for given CATALOG." - (let ((sqltype-list)) - ;; build the sqltype list - (loop :for table :in (append (table-list catalog) - (view-list catalog)) - :do (loop :for column :in (table-column-list table) - :do (when (typep (column-type-name column) 'sqltype) - (pushnew (column-type-name column) sqltype-list - :test #'string-equal - :key #'sqltype-name)))) - - ;; now create the types + (let ((sqltype-list (sqltype-list catalog))) (loop :for sqltype :in sqltype-list :when include-drop :count t @@ -114,6 +104,19 @@ :log-level log-level :client-min-messages client-min-messages))))) +(defun create-extensions (catalog + &key + if-not-exists + include-drop + (client-min-messages :notice)) + "Create all extensions from the given database CATALOG." + (let ((sql + (loop :for extension :in (extension-list catalog) + :when include-drop + :collect (format-drop-sql extension :if-exists t :cascade t) + :collect (format-create-sql extension :if-not-exists if-not-exists)))) + (pgsql-execute sql :client-min-messages client-min-messages))) + (defun create-tables (catalog &key if-not-exists diff --git a/src/pgsql/pgsql-ddl.lisp b/src/pgsql/pgsql-ddl.lisp index fa29e27..580618e 100644 --- a/src/pgsql/pgsql-ddl.lisp +++ b/src/pgsql/pgsql-ddl.lisp @@ -38,6 +38,25 @@ (sqltype-name sqltype) cascade)) + +;;; +;;; Extensions +;;; +(defmethod format-create-sql ((extension extension) + &key (stream nil) if-not-exists) + (format stream "CREATE EXTENSION~:[~; IF NOT EXISTS~] ~a WITH SCHEMA ~a;" + if-not-exists + (extension-name extension) + (schema-name (extension-schema extension)))) + +(defmethod format-drop-sql ((extension extension) + &key (stream nil) cascade if-exists) + (format stream "DROP EXTENSION~:[~; IF EXISTS~] ~a~@[ CASCADE~];" + if-exists + (extension-name extension) + cascade)) + + ;;; ;;; Tables @@ -126,26 +145,30 @@ "Common normalized default values and 
their PostgreSQL spelling.") (defmethod format-default-value ((column column) &key (stream nil)) - (let* ((default (column-default column)) - (clean-default (cdr (assoc default *pgsql-default-values*))) - (transform (column-transform column))) - (or clean-default - (if transform - (let* ((transformed-default - (handler-case - (funcall transform default) - (condition (c) - (log-message :warning - "Failed to transform default value ~s: ~a" - default c) - ;; can't transform: return nil - nil))) - (transformed-column - (make-column :default transformed-default))) - (format-default-value transformed-column)) - (if default - (ensure-quoted default #\') - (format stream "NULL")))))) + (if (column-transform-default column) + (let* ((default (column-default column)) + (clean-default (cdr (assoc default *pgsql-default-values*))) + (transform (column-transform column))) + (or clean-default + (if transform + (let* ((transformed-default + (handler-case + (funcall transform default) + (condition (c) + (log-message :warning + "Failed to transform default value ~s: ~a" + default c) + ;; can't transform: return nil + nil))) + (transformed-column + (make-column :default transformed-default))) + (format-default-value transformed-column)) + (if default + (ensure-quoted default #\') + (format stream "NULL"))))) + + ;; else, when column-transform-default is nil: + (column-default column))) ;;; diff --git a/src/pgsql/pgsql-schema.lisp b/src/pgsql/pgsql-schema.lisp index 8bdf158..e5ce1af 100644 --- a/src/pgsql/pgsql-schema.lisp +++ b/src/pgsql/pgsql-schema.lisp @@ -19,6 +19,10 @@ (t including)))) + (list-all-sqltypes catalog + :including including + :excluding excluding) + (list-all-columns catalog :table-type :table :including including @@ -116,18 +120,34 @@ "Associate internal table type symbol with what's found in PostgreSQL pg_class.relkind column.") -(defun filter-list-to-where-clause (filter-list +(defun filter-list-to-where-clause (schema-filter-list &optional not (schema-col 
"table_schema") (table-col "table_name")) "Given an INCLUDING or EXCLUDING clause, turn it into a PostgreSQL WHERE clause." - (loop :for (schema . table-name-list) :in filter-list - :append (mapcar (lambda (table-name) - (format nil "(~a = '~a' and ~a ~:[~;NOT ~]~~ '~a')" - schema-col schema table-col not table-name)) - table-name-list))) + (loop :for (schema . filter-list) :in schema-filter-list + :append (mapcar (lambda (filter) + (typecase filter + (string-match-rule + (format nil "(~a = '~a' and ~a ~:[~;!~]= '~a')" + schema-col + schema + table-col + not + (string-match-rule-target filter))) + (regex-match-rule + (format nil "(~a = '~a' and ~a ~:[~;NOT ~]~~ '~a')" + schema-col + schema + table-col + not + (regex-match-rule-target filter))))) + filter-list))) + +(defun normalize-extra (extra) + (cond ((string= "auto_increment" extra) :auto-increment))) (defun list-all-columns (catalog &key @@ -137,7 +157,8 @@ &aux (table-type-name (cdr (assoc table-type *table-type*)))) "Get the list of PostgreSQL column names per table." 
- (loop :for (schema-name table-name table-oid name type typmod notnull default) + (loop :for (schema-name table-name table-oid + name type typmod notnull default extra) :in (query nil (format nil @@ -160,7 +181,9 @@ :type-name type :type-mod typmod :nullable (not notnull) - :default default))) + :default default + :transform-default nil + :extra (normalize-extra extra)))) (add-field table field)) :finally (return catalog))) @@ -187,7 +210,7 @@ (tschema (find-schema catalog table-schema)) (table (find-table tschema table-name)) (pg-index - (make-index :name name + (make-index :name (ensure-quoted name) :oid oid :schema schema :table table @@ -195,8 +218,10 @@ :unique unique :columns nil :sql sql - :conname (unless (eq :null conname) conname) - :condef (unless (eq :null condef) condef)))) + :conname (unless (eq :null conname) + (ensure-quoted conname)) + :condef (unless (eq :null condef) + condef)))) (maybe-add-index table name pg-index :key #'index-name)) :finally (return catalog))) @@ -247,7 +272,7 @@ (fschema (find-schema catalog fschema-name)) (ftable (find-table fschema ftable-name)) (fk - (make-fkey :name conname + (make-fkey :name (ensure-quoted conname) :oid conoid :condef condef :table table @@ -355,3 +380,44 @@ (sql "/pgsql/list-table-oids-from-temp-table.sql")))) :do (setf (gethash name oidmap) oid))) oidmap)) + + + +;;; +;;; PostgreSQL specific support for extensions and user defined data types. +;;; +(defun list-all-sqltypes (catalog &key including excluding) + "Set the catalog's schema extension list and sqltype list" + (loop :for (schema-name extension-name type-name enum-values) + :in (query nil + (format nil + (sql "/pgsql/list-all-sqltypes.sql") + including ; do we print the clause? + (filter-list-to-where-clause including + nil + "n.nspname" + "c.relname") + excluding ; do we print the clause? 
+ (filter-list-to-where-clause excluding + nil + "n.nspname" + "c.relname"))) + :do + (let* ((schema (maybe-add-schema catalog schema-name)) + (sqltype + (make-sqltype :name (ensure-quoted type-name) + :schema schema + :type (when enum-values :enum) + :extra (when (and enum-values + (not (eq enum-values :null))) + (coerce enum-values 'list))))) + + (if (and extension-name (not (eq :null extension-name))) + ;; then create extension will create the type + (maybe-add-extension schema extension-name) + + ;; only create a specific entry for types that we need to create + ;; ourselves, when extension is not null "create extension" is + ;; going to take care of creating the type. + (add-sqltype schema sqltype))) + :finally (return catalog))) diff --git a/src/pgsql/sql/list-all-columns.sql b/src/pgsql/sql/list-all-columns.sql index d3223e1..8875c4d 100644 --- a/src/pgsql/sql/list-all-columns.sql +++ b/src/pgsql/sql/list-all-columns.sql @@ -3,17 +3,37 @@ -- filter-list-to-where-clause for including -- excluding -- filter-list-to-where-clause for excluding +with seqattr as + ( + select adrelid, + adnum, + adsrc, + case when adsrc ~~ 'nextval' + then (regexp_match(pg_get_expr(d.adbin, d.adrelid), + '''([^'']+)''') + )[1]::regclass::oid + else null::oid + end as seqoid + from pg_attrdef d + ) select nspname, relname, c.oid, attname, t.oid::regtype as type, - case when atttypmod > 0 then atttypmod - 4 else null end as typmod, + case when atttypmod > 0 + then substring(format_type(t.oid, atttypmod) from '\d+(?:,\d+)?') + else null + end as typmod, attnotnull, - case when atthasdef then def.adsrc end as default + case when atthasdef then def.adsrc end as default, + case when s.seqoid is not null then 'auto_increment' end as extra from pg_class c join pg_namespace n on n.oid = c.relnamespace left join pg_attribute a on c.oid = a.attrelid join pg_type t on t.oid = a.atttypid and attnum > 0 left join pg_attrdef def on a.attrelid = def.adrelid and a.attnum = def.adnum + and 
a.atthasdef + left join seqattr s on def.adrelid = s.adrelid + and def.adnum = s.adnum where nspname !~~ '^pg_' and n.nspname <> 'information_schema' and relkind in (~{'~a'~^, ~}) diff --git a/src/pgsql/sql/list-all-extensions.sql b/src/pgsql/sql/list-all-extensions.sql new file mode 100644 index 0000000..00a9aff --- /dev/null +++ b/src/pgsql/sql/list-all-extensions.sql @@ -0,0 +1,4 @@ +select nspname, extname + from pg_extension e + join pg_namespace n on n.oid = e.extnamespace + where nspname !~ '^pg_'; diff --git a/src/pgsql/sql/list-all-sqltypes.sql b/src/pgsql/sql/list-all-sqltypes.sql new file mode 100644 index 0000000..cfaf791 --- /dev/null +++ b/src/pgsql/sql/list-all-sqltypes.sql @@ -0,0 +1,43 @@ +-- +-- get user defined SQL types +-- + select nt.nspname, + extname, + typname, + case when enum.enumtypid is not null + then array_agg(enum.enumlabel order by enumsortorder) + end as enumvalues + + from pg_class c + join pg_namespace n on n.oid = c.relnamespace + left join pg_attribute a on c.oid = a.attrelid and a.attnum > 0 + join pg_type t on t.oid = a.atttypid + left join pg_namespace nt on nt.oid = t.typnamespace + left join pg_depend d on d.classid = 'pg_type'::regclass + and d.refclassid = 'pg_extension'::regclass + and d.objid = t.oid + left join pg_extension e on refobjid = e.oid + left join pg_enum enum on enum.enumtypid = t.oid + + where nt.nspname !~~ '^pg_' and nt.nspname <> 'information_schema' + and n.nspname !~~ '^pg_' and n.nspname <> 'information_schema' + and c.relkind in ('r', 'f', 'p') + ~:[~*~;and (~{~a~^~&~10t or ~})~] + ~:[~*~;and (~{~a~^~&~10t and ~})~] + and + ( t.typrelid = 0 + or + (select c.relkind = 'c' + from pg_class c + where c.oid = t.typrelid) + ) + and not exists + ( + select 1 + from pg_type el + where el.oid = t.typelem + and el.typarray = t.oid + ) + +group by nt.nspname, extname, typname, enumtypid +order by nt.nspname, extname, typname, enumtypid; diff --git a/src/sources/pgsql/pgsql-cast-rules.lisp 
b/src/sources/pgsql/pgsql-cast-rules.lisp new file mode 100644 index 0000000..2ef0373 --- /dev/null +++ b/src/sources/pgsql/pgsql-cast-rules.lisp @@ -0,0 +1,48 @@ +;;; +;;; Tools to handle PostgreSQL data type casting rules +;;; + +(in-package :pgloader.source.pgsql) + +(defparameter *pgsql-default-cast-rules* + '((:source (:type "integer" :auto-increment t) + :target (:type "serial" :drop-default t)) + + (:source (:type "bigint" :auto-increment t) + :target (:type "bigserial" :drop-default t))) + "Data Type Casting to migrate from PostgtreSQL to PostgreSQL") + +(defmethod pgsql-column-ctype ((column column)) + "Build the ctype definition from the PostgreSQL column information." + (let ((type-name (column-type-name column)) + (type-mod (unless (or (null (column-type-mod column)) + (eq :null (column-type-mod column))) + (column-type-mod column)))) + (format nil "~a~@[(~a)~]" type-name type-mod))) + +(defmethod cast ((field column) &key &allow-other-keys) + "Return the PostgreSQL type definition from the given PostgreSQL column + definition" + (with-slots (pgloader.catalog::name + pgloader.catalog::type-name + pgloader.catalog::type-mod + pgloader.catalog::nullable + pgloader.catalog::default + pgloader.catalog::comment + pgloader.catalog::transform + pgloader.catalog::extra) + field + (let* ((ctype (pgsql-column-ctype field)) + (pgcol (apply-casting-rules nil + pgloader.catalog::name + pgloader.catalog::type-name + ctype + pgloader.catalog::default + pgloader.catalog::nullable + pgloader.catalog::extra))) + ;; re-install our instruction not to transform default value: it comes + ;; from PostgreSQL, and we trust it. + (setf (column-transform-default pgcol) + (column-transform-default field)) + + pgcol))) diff --git a/src/sources/pgsql/pgsql.lisp b/src/sources/pgsql/pgsql.lisp new file mode 100644 index 0000000..e8cab7b --- /dev/null +++ b/src/sources/pgsql/pgsql.lisp @@ -0,0 +1,90 @@ +;;; +;;; Read from a PostgreSQL database. 
+;;; + +(in-package :pgloader.source.pgsql) + +(defclass copy-pgsql (db-copy) () + (:documentation "pgloader PostgreSQL Data Source")) + +(defmethod initialize-instance :after ((source copy-pgsql) &key) + "Add a default value for transforms in case it's not been provided." + (let* ((transforms (when (slot-boundp source 'transforms) + (slot-value source 'transforms)))) + (when (and (slot-boundp source 'fields) (slot-value source 'fields)) + ;; cast typically happens in copy-database in the schema structure, + ;; and the result is then copied into the copy-mysql instance. + (unless (and (slot-boundp source 'columns) (slot-value source 'columns)) + (setf (slot-value source 'columns) + (mapcar #'cast (slot-value source 'fields)))) + + (unless transforms + (setf (slot-value source 'transforms) + (mapcar #'column-transform (slot-value source 'columns))))))) + +(defmethod map-rows ((pgsql copy-pgsql) &key process-row-fn) + "Extract PostgreSQL data and call PROCESS-ROW-FN function with a single + argument (a list of column values) for each row" + (let ((map-reader + ;; + ;; Build a Postmodern row reader that prepares a vector of strings + ;; and call PROCESS-ROW-FN with the vector as single argument. 
+ ;; + (cl-postgres:row-reader (fields) + (let ((nb-cols (length fields))) + (loop :while (cl-postgres:next-row) + :do (let ((row (make-array nb-cols))) + (loop :for i :from 0 + :for field :across fields + :do (setf (aref row i) + (cl-postgres:next-field field))) + (funcall process-row-fn row))))))) + + (with-pgsql-connection ((source-db pgsql)) + (let* ((cols (mapcar #'column-name (fields pgsql))) + (sql + (format nil "SELECT ~{~s::text~^, ~} FROM ~s.~s" cols + (schema-source-name (table-schema (source pgsql))) + (table-source-name (source pgsql))))) + (cl-postgres:exec-query pomo:*database* sql map-reader))))) + +(defmethod fetch-metadata ((pgsql copy-pgsql) + (catalog catalog) + &key + materialize-views + only-tables + create-indexes + foreign-keys + including + excluding) + "PostgreSQL introspection to prepare the migration." + (declare (ignore materialize-views only-tables)) + (with-stats-collection ("fetch meta data" + :use-result-as-rows t + :use-result-as-read t + :section :pre) + (with-pgsql-transaction (:pgconn (source-db pgsql)) + (list-all-sqltypes catalog + :including including + :excluding excluding) + + (list-all-columns catalog + :including including + :excluding excluding) + + (when create-indexes + (list-all-indexes catalog + :including including + :excluding excluding)) + + (when foreign-keys + (list-all-fkeys catalog + :including including + :excluding excluding)) + + ;; return how many objects we're going to deal with in total + ;; for stats collection + (+ (count-tables catalog) (count-indexes catalog)))) + + ;; be sure to return the catalog itself + catalog) diff --git a/src/sources/sqlite/sqlite.lisp b/src/sources/sqlite/sqlite.lisp index f6f97de..99e1ab7 100644 --- a/src/sources/sqlite/sqlite.lisp +++ b/src/sources/sqlite/sqlite.lisp @@ -96,7 +96,7 @@ "Send the data in the SQLite column ordering." 
(mapcar #'apply-identifier-case (mapcar #'coldef-name (fields sqlite)))) -(defmethod fetch-metadata (sqlite catalog +(defmethod fetch-metadata ((sqlite copy-sqlite) (catalog catalog) &key materialize-views only-tables diff --git a/src/utils/catalog.lisp b/src/utils/catalog.lisp index c81758f..76a4857 100644 --- a/src/utils/catalog.lisp +++ b/src/utils/catalog.lisp @@ -43,25 +43,35 @@ ;;; implemented in each source separately. ;;; (defstruct catalog name schema-list types-without-btree) -(defstruct schema source-name name catalog table-list view-list in-search-path) + +(defstruct schema source-name name catalog in-search-path + table-list view-list extension-list sqltype-list) + (defstruct table source-name name schema oid comment storage-parameter-list ;; field is for SOURCE ;; column is for TARGET field-list column-list index-list fkey-list trigger-list) +;;; +;;; When migrating from PostgreSQL to PostgreSQL we might have to install +;;; extensions to have data type coverage. +;;; +(defstruct extension name schema) + ;;; ;;; When migrating from another database to PostgreSQL some data types might ;;; need to be tranformed dynamically into User Defined Types: ENUMs, SET, ;;; etc. ;;; -(defstruct sqltype name schema type source-def extra) +(defstruct sqltype name schema type source-def extra extension) ;;; ;;; The generic PostgreSQL column that the CAST generic function is asked to ;;; produce, so that we know how to CREATE TABLEs in PostgreSQL whatever the ;;; source is. 
;;; -(defstruct column name type-name type-mod nullable default comment transform extra) +(defstruct column name type-name type-mod nullable default comment + transform extra (transform-default t)) ;;; ;;; Index and Foreign Keys @@ -94,13 +104,18 @@ ;;; ;;; Main data collection API ;;; -(defgeneric add-schema (object schema-name &key)) -(defgeneric add-table (object table-name &key)) -(defgeneric add-view (object view-name &key)) -(defgeneric add-column (object column &key)) -(defgeneric add-index (object index &key)) -(defgeneric add-fkey (object fkey &key)) -(defgeneric add-comment (object comment &key)) +(defgeneric add-schema (object schema-name &key)) +(defgeneric add-extension (object extension-name &key)) +(defgeneric add-table (object table-name &key)) +(defgeneric add-view (object view-name &key)) +(defgeneric add-sqltype (object column &key)) +(defgeneric add-column (object column &key)) +(defgeneric add-index (object index &key)) +(defgeneric add-fkey (object fkey &key)) +(defgeneric add-comment (object comment &key)) + +(defgeneric extension-list (object &key) + (:documentation "Return the list of extensions found in OBJECT.")) (defgeneric table-list (object &key) (:documentation "Return the list of tables found in OBJECT.")) @@ -112,6 +127,10 @@ (:documentation "Find a schema by SCHEMA-NAME in a catalog OBJECT and return the schema")) +(defgeneric find-extension (object extension-name &key) + (:documentation + "Find an extension by EXTENSION-NAME in a schema OBJECT and return the table")) + (defgeneric find-table (object table-name &key) (:documentation "Find a table by TABLE-NAME in a schema OBJECT and return the table")) @@ -131,6 +150,9 @@ (defgeneric maybe-add-schema (object schema-name &key) (:documentation "Add a new schema or return existing one.")) +(defgeneric maybe-add-extension (object extension-name &key) + (:documentation "Add a new extension or return existing one.")) + (defgeneric maybe-add-table (object table-name &key) (:documentation 
"Add a new table or return existing one.")) @@ -167,6 +189,35 @@ ;;; ;;; Implementation of the methods ;;; +(defmethod extension-list ((schema schema) &key) + "Return the list of extensions for SCHEMA." + (schema-extension-list schema)) + +(defmethod extension-list ((catalog catalog) &key) + "Return the list of extensions for CATALOG." + (apply #'append (mapcar #'extension-list (catalog-schema-list catalog)))) + +(defmethod sqltype-list ((column column) &key) + "Return the list of sqltypes for SCHEMA." + (when (typep (column-type-name column) 'sqltype) + (column-type-name column))) + +(defmethod sqltype-list ((table table) &key) + "Return the list of sqltypes for SCHEMA." + (apply #'append (mapcar #'sqltype-list (table-column-list table)))) + +(defmethod sqltype-list ((schema schema) &key) + "Return the list of sqltypes for SCHEMA." + (append (schema-sqltype-list schema) + (apply #'append + (mapcar #'sqltype-list (schema-table-list schema))))) + +(defmethod sqltype-list ((catalog catalog) &key) + "Return the list of sqltypes for CATALOG." + (remove-duplicates + (apply #'append (mapcar #'sqltype-list (catalog-schema-list catalog))) + :test #'string-equal :key #'sqltype-name)) + (defmethod table-list ((schema schema) &key) "Return the list of tables for SCHEMA." (schema-table-list schema)) @@ -212,6 +263,17 @@ :in-search-path in-search-path))) (push-to-end schema (catalog-schema-list catalog)))) +(defmethod add-extension ((schema schema) extension-name &key) + "Add EXTENSION-NAME to SCHEMA and return the new extension instance." + (let ((extension + (make-extension :name extension-name + :schema schema))) + (push-to-end extension (schema-extension-list schema)))) + +(defmethod add-sqltype ((schema schema) sqltype &key) + "Add SQLTYPE instance to SCHEMA and return SQLTYPE." + (push-to-end sqltype (schema-sqltype-list schema))) + (defmethod add-table ((schema schema) table-name &key comment oid) "Add TABLE-NAME to SCHEMA and return the new table instance." 
(let ((table @@ -238,6 +300,11 @@ (find schema-name (catalog-schema-list catalog) :key #'schema-source-name :test 'string=)) +(defmethod find-extension ((schema schema) extension-name &key) + "Find EXTENSION-NAME in SCHEMA and return the EXTENSION object of this name." + (find extension-name (schema-extension-list schema) + :key #'extension-name :test 'string=)) + (defmethod find-table ((schema schema) table-name &key) "Find TABLE-NAME in SCHEMA and return the TABLE object of this name." (find table-name (schema-table-list schema) @@ -254,6 +321,12 @@ (let ((schema (find-schema catalog schema-name))) (or schema (add-schema catalog schema-name)))) +(defmethod maybe-add-extension ((schema schema) extension-name &key) + "Add TABLE-NAME to the table-list for SCHEMA, or return the existing table + of the same name if it already exists in the schema table-list." + (let ((extension (find-extension schema extension-name))) + (or extension (add-extension schema extension-name)))) + (defmethod maybe-add-table ((schema schema) table-name &key comment oid) "Add TABLE-NAME to the table-list for SCHEMA, or return the existing table of the same name if it already exists in the schema table-list." 
diff --git a/test/archive.load b/test/archive.load index de0f6f5..3d97e14 100644 --- a/test/archive.load +++ b/test/archive.load @@ -8,7 +8,8 @@ */ LOAD ARCHIVE - FROM http://pgsql.tapoueh.org/temp/foo.zip + -- FROM http://pgsql.tapoueh.org/temp/foo.zip + FROM http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip INTO postgresql:///ip4r BEFORE LOAD diff --git a/test/pgsql-source.load b/test/pgsql-source.load new file mode 100644 index 0000000..7e74bc3 --- /dev/null +++ b/test/pgsql-source.load @@ -0,0 +1,6 @@ +load database + from pgsql://localhost/pgloader + into pgsql://localhost/copy + + -- including only table names matching 'bits', ~/utilisateur/ in schema 'mysql' + ; From d3bfb1db31cad2c10ff185ae8d891718c197163b Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Mon, 20 Aug 2018 11:50:50 +0200 Subject: [PATCH 06/69] Bugfix previous commit: filter list format changed. We now accept the more general string and regex match rules, but the code to generate including and excluding lists from the catalogs had not been updated. --- src/pgsql/pgsql-schema.lisp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pgsql/pgsql-schema.lisp b/src/pgsql/pgsql-schema.lisp index e5ce1af..b47c4e5 100644 --- a/src/pgsql/pgsql-schema.lisp +++ b/src/pgsql/pgsql-schema.lisp @@ -100,7 +100,7 @@ (defun format-table-name-as-including-exp (table) "Return a table name suitable for a catalog lookup using ~ operator." (let ((table-name (table-name table))) - (format nil "^~a$" (ensure-unquoted table-name)))) + (make-string-match-rule :target (ensure-unquoted table-name)))) (defun query-table-schema (table) "Get PostgreSQL schema name where to locate TABLE-NAME by following the From cb633aa092e83aa95b1e0483d6d6ce731bf6bdfe Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Mon, 20 Aug 2018 11:52:59 +0200 Subject: [PATCH 07/69] Refrain from some introspections on non-PGDG PostgreSQL variants. 
When dealing with PostgreSQL protocol compatible databases, often enough they don't support the same catalogs as PostgreSQL itself. Redshift for instance lacks foreign key support. --- src/load/load-file.lisp | 4 +++- src/pgsql/pgsql-schema.lisp | 30 ++++++++++++++++++------------ src/sources/pgsql/pgsql.lisp | 34 ++++++++++++++++++---------------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/src/load/load-file.lisp b/src/load/load-file.lisp index 19819e5..7d36d12 100644 --- a/src/load/load-file.lisp +++ b/src/load/load-file.lisp @@ -42,7 +42,9 @@ (handler-case (with-pgsql-connection (pgconn) (setf pgsql-catalog - (fetch-pgsql-catalog (db-name pgconn) :table (target copy))) + (fetch-pgsql-catalog (db-name pgconn) + :table (target copy) + :variant (pgconn-variant pgconn))) ;; if the user didn't tell us the column list of the table, now is ;; a proper time to set it in the copy object diff --git a/src/pgsql/pgsql-schema.lisp b/src/pgsql/pgsql-schema.lisp index b47c4e5..9ea3d59 100644 --- a/src/pgsql/pgsql-schema.lisp +++ b/src/pgsql/pgsql-schema.lisp @@ -5,7 +5,12 @@ (in-package :pgloader.pgsql) (defun fetch-pgsql-catalog (dbname - &key table source-catalog including excluding) + &key + table + source-catalog + including + excluding + (variant :pgdg)) "Fetch PostgreSQL catalogs for the target database. A PostgreSQL connection must be opened." 
(let* ((*identifier-case* :quote) @@ -18,10 +23,10 @@ (t including)))) - - (list-all-sqltypes catalog - :including including - :excluding excluding) + (when (eq :pgdg variant) + (list-all-sqltypes catalog + :including including + :excluding excluding)) (list-all-columns catalog :table-type :table @@ -32,14 +37,15 @@ :including including :excluding excluding) - (list-all-fkeys catalog - :including including - :excluding excluding) + (when (eq :pgdg variant) + (list-all-fkeys catalog + :including including + :excluding excluding) - ;; fetch fkey we depend on with UNIQUE indexes but that have been - ;; excluded from the target list, we still need to take care of them to - ;; be able to DROP then CREATE those indexes again - (list-missing-fk-deps catalog) + ;; fetch fkey we depend on with UNIQUE indexes but that have been + ;; excluded from the target list, we still need to take care of them to + ;; be able to DROP then CREATE those indexes again + (list-missing-fk-deps catalog)) (log-message :debug "fetch-pgsql-catalog: ~d tables, ~d indexes, ~d+~d fkeys" (count-tables catalog) diff --git a/src/sources/pgsql/pgsql.lisp b/src/sources/pgsql/pgsql.lisp index e8cab7b..8a45a58 100644 --- a/src/sources/pgsql/pgsql.lisp +++ b/src/sources/pgsql/pgsql.lisp @@ -64,27 +64,29 @@ :use-result-as-read t :section :pre) (with-pgsql-transaction (:pgconn (source-db pgsql)) - (list-all-sqltypes catalog + (let ((variant (pgconn-variant (source-db pgsql)))) + (when (eq :pgdg variant) + (list-all-sqltypes catalog + :including including + :excluding excluding)) + + (list-all-columns catalog :including including :excluding excluding) - (list-all-columns catalog - :including including - :excluding excluding) + (when create-indexes + (list-all-indexes catalog + :including including + :excluding excluding)) - (when create-indexes - (list-all-indexes catalog - :including including - :excluding excluding)) + (when (and (eq :pgdg variant) foreign-keys) + (list-all-fkeys catalog + :including 
including + :excluding excluding)) - (when foreign-keys - (list-all-fkeys catalog - :including including - :excluding excluding)) - - ;; return how many objects we're going to deal with in total - ;; for stats collection - (+ (count-tables catalog) (count-indexes catalog)))) + ;; return how many objects we're going to deal with in total + ;; for stats collection + (+ (count-tables catalog) (count-indexes catalog))))) ;; be sure to return the catalog itself catalog) From c9b905b7ac3fa008d7dfeaf7bde539b228ad3f3e Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Mon, 20 Aug 2018 11:55:47 +0200 Subject: [PATCH 08/69] Simplify our ASD system definition by using :serial t. This allows to drop manually maintained list of files dependencies, instead implying them by the order in which we list the files. --- pgloader.asd | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/pgloader.asd b/pgloader.asd index 89db8c8..12b1684 100644 --- a/pgloader.asd +++ b/pgloader.asd @@ -149,40 +149,30 @@ ;(:file "syslog") ; experimental... 
(:module "sqlite" + :serial t :depends-on ("common") :components ((:file "sqlite-cast-rules") - (:file "sqlite-schema" - :depends-on ("sqlite-cast-rules")) - (:file "sqlite" - :depends-on ("sqlite-cast-rules" - "sqlite-schema")))) + (:file "sqlite-schema") + (:file "sqlite"))) (:module "mssql" + :serial t :depends-on ("common") :components ((:file "mssql-cast-rules") - (:file "mssql-schema" - :depends-on ("mssql-cast-rules")) - (:file "mssql" - :depends-on ("mssql-cast-rules" - "mssql-schema")) - (:file "mssql-index-filters" - :depends-on ("mssql")))) + (:file "mssql-schema") + (:file "mssql") + (:file "mssql-index-filters"))) (:module "mysql" + :serial t :depends-on ("common") :components ((:file "mysql-cast-rules") (:file "mysql-connection") - (:file "mysql-schema" - :depends-on ("mysql-connection" - "mysql-cast-rules")) - ;; (:file "mysql-csv" - ;; :depends-on ("mysql-schema")) - (:file "mysql" - :depends-on ("mysql-cast-rules" - "mysql-schema")))) + (:file "mysql-schema") + (:file "mysql"))) (:module "pgsql" :serial t From 4fbfd9e5223855690f6b30b876d8e4eb658aeb8e Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 22 Aug 2018 10:52:01 +0200 Subject: [PATCH 09/69] Refrain from using regexp_match() function, introduced in Pg10. Instead use the substring() function which has been there all along. See #813. 
--- src/pgsql/sql/list-all-columns.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pgsql/sql/list-all-columns.sql b/src/pgsql/sql/list-all-columns.sql index 8875c4d..11be443 100644 --- a/src/pgsql/sql/list-all-columns.sql +++ b/src/pgsql/sql/list-all-columns.sql @@ -9,9 +9,9 @@ with seqattr as adnum, adsrc, case when adsrc ~~ 'nextval' - then (regexp_match(pg_get_expr(d.adbin, d.adrelid), - '''([^'']+)''') - )[1]::regclass::oid + then substring(pg_get_expr(d.adbin, d.adrelid) + from '''([^'']+)''' + )::regclass::oid else null::oid end as seqoid from pg_attrdef d From 0f58a3c84d3694fda01ba1fbf0ccc4f2ea205461 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 31 Aug 2018 22:51:41 -0700 Subject: [PATCH 10/69] Assorted fixes: catalogs SQLtypes and MySQL decoding as. It turns out that when trying to debug "decoding as" the SQLtype listing support in sqltype-list was found broken, so this patch fixes it. Then goes on to fix the DECODING AS filters support, which we have switched to using the better regexp-or-string filter struct but forgot to update the matching code accordingly. Fixes #665. --- src/sources/mysql/mysql.lisp | 8 +------- src/utils/catalog.lisp | 6 ++++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/sources/mysql/mysql.lisp b/src/sources/mysql/mysql.lisp index 1710c8c..7cc9555 100644 --- a/src/sources/mysql/mysql.lisp +++ b/src/sources/mysql/mysql.lisp @@ -235,13 +235,7 @@ Illegal ~a character starting at position ~a~@[: ~a~].~%" (defun apply-decoding-as-filters (table-name filters) "Return a generialized boolean which is non-nil only if TABLE-NAME matches one of the FILTERS." - (flet ((apply-filter (filter) - ;; we close over table-name here. 
- (typecase filter - (string (string-equal filter table-name)) - (list (destructuring-bind (type val) filter - (ecase type - (:regex (cl-ppcre:scan val table-name)))))))) + (flet ((apply-filter (filter) (matches filter table-name))) (some #'apply-filter filters))) (defmethod instanciate-table-copy-object ((copy copy-mysql) (table table)) diff --git a/src/utils/catalog.lisp b/src/utils/catalog.lisp index 76a4857..c61ce8f 100644 --- a/src/utils/catalog.lisp +++ b/src/utils/catalog.lisp @@ -204,7 +204,7 @@ (defmethod sqltype-list ((table table) &key) "Return the list of sqltypes for SCHEMA." - (apply #'append (mapcar #'sqltype-list (table-column-list table)))) + (mapcar #'sqltype-list (table-column-list table))) (defmethod sqltype-list ((schema schema) &key) "Return the list of sqltypes for SCHEMA." @@ -215,7 +215,9 @@ (defmethod sqltype-list ((catalog catalog) &key) "Return the list of sqltypes for CATALOG." (remove-duplicates - (apply #'append (mapcar #'sqltype-list (catalog-schema-list catalog))) + (remove-if #'null + (apply #'append + (mapcar #'sqltype-list (catalog-schema-list catalog)))) :test #'string-equal :key #'sqltype-name)) (defmethod table-list ((schema schema) &key) From 5119d864f4107f4d4d2e4e850fe4b44dc33a0bbc Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 4 Sep 2018 11:49:21 +0200 Subject: [PATCH 11/69] Assorted bug fixes in the context of Redshift support as a source. The catalog queries used in pgloader have to be adjusted for Redshift because this thing forked PostgreSQL 8.0, which is a long time ago now. Also, we had a couple bugs here and there that were not really related to Redshift support but were shown in that context. Fixes #813. 
--- src/parsers/command-pgsql.lisp | 6 ++++-- src/pgsql/sql/list-all-columns.sql | 8 ++++---- src/pgsql/sql/list-all-indexes.sql | 9 +++++---- src/sources/pgsql/pgsql-cast-rules.lisp | 23 ++++++++++++++++++++++- src/utils/transforms.lisp | 2 +- 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/parsers/command-pgsql.lisp b/src/parsers/command-pgsql.lisp index 2a09fd7..3650534 100644 --- a/src/parsers/command-pgsql.lisp +++ b/src/parsers/command-pgsql.lisp @@ -45,7 +45,8 @@ (cons schema filter-list)))) (defrule including-matching-in-schema - (and including-in-schema (* including-in-schema)) + (and including-matching-in-schema-filter + (* including-matching-in-schema-filter)) (:lambda (source) (destructuring-bind (inc1 incs) source (cons :including (list* inc1 incs))))) @@ -58,7 +59,8 @@ (cons schema filter-list)))) (defrule excluding-matching-in-schema - (and excluding-in-schema (* excluding-in-schema)) + (and excluding-matching-in-schema-filter + (* excluding-matching-in-schema-filter)) (:lambda (source) (destructuring-bind (excl1 excls) source (cons :excluding (list* excl1 excls))))) diff --git a/src/pgsql/sql/list-all-columns.sql b/src/pgsql/sql/list-all-columns.sql index 11be443..75f8a52 100644 --- a/src/pgsql/sql/list-all-columns.sql +++ b/src/pgsql/sql/list-all-columns.sql @@ -11,9 +11,9 @@ with seqattr as case when adsrc ~~ 'nextval' then substring(pg_get_expr(d.adbin, d.adrelid) from '''([^'']+)''' - )::regclass::oid - else null::oid - end as seqoid + ) + else null + end as seqname from pg_attrdef d ) select nspname, relname, c.oid, attname, @@ -24,7 +24,7 @@ with seqattr as end as typmod, attnotnull, case when atthasdef then def.adsrc end as default, - case when s.seqoid is not null then 'auto_increment' end as extra + case when s.seqname is not null then 'auto_increment' end as extra from pg_class c join pg_namespace n on n.oid = c.relnamespace left join pg_attribute a on c.oid = a.attrelid diff --git a/src/pgsql/sql/list-all-indexes.sql 
b/src/pgsql/sql/list-all-indexes.sql index 320a6e0..bfffbf7 100644 --- a/src/pgsql/sql/list-all-indexes.sql +++ b/src/pgsql/sql/list-all-indexes.sql @@ -17,10 +17,11 @@ join pg_class r ON r.oid = x.indrelid join pg_namespace n ON n.oid = i.relnamespace join pg_namespace rn ON rn.oid = r.relnamespace - left join pg_constraint c ON c.conindid = i.oid - and c.conrelid = r.oid - -- filter out self-fkeys - and c.confrelid <> r.oid + left join pg_depend d on d.classid = 'pg_class'::regclass + and d.objid = i.oid + and d.refclassid = 'pg_constraint'::regclass + and d.deptype = 'i' + left join pg_constraint c ON c.oid = d.refobjid where n.nspname !~~ '^pg_' and n.nspname <> 'information_schema' ~:[~*~;and (~{~a~^~&~10t or ~})~] ~:[~*~;and (~{~a~^~&~10t and ~})~] diff --git a/src/sources/pgsql/pgsql-cast-rules.lisp b/src/sources/pgsql/pgsql-cast-rules.lisp index 2ef0373..6ac37ee 100644 --- a/src/sources/pgsql/pgsql-cast-rules.lisp +++ b/src/sources/pgsql/pgsql-cast-rules.lisp @@ -9,7 +9,10 @@ :target (:type "serial" :drop-default t)) (:source (:type "bigint" :auto-increment t) - :target (:type "bigserial" :drop-default t))) + :target (:type "bigserial" :drop-default t)) + + (:source (:type "character varying") + :target (:type "text" :drop-typemod t))) "Data Type Casting to migrate from PostgtreSQL to PostgreSQL") (defmethod pgsql-column-ctype ((column column)) @@ -45,4 +48,22 @@ (setf (column-transform-default pgcol) (column-transform-default field)) + ;; Redshift may be using DEFAULT getdate() instead of now() + (let ((default (column-default pgcol))) + (setf (column-default pgcol) + (cond + ((and (stringp default) (string= "NULL" default)) + :null) + + ((and (stringp default) + (or (string= "getdate()" default))) + :current-timestamp) + + (t (column-default pgcol)))) + + ;; we usually trust defaults that come from PostgreSQL... but we + ;; also have support for Redshift. 
+ (when (member (column-default pgcol) '(:null :current-timestamp)) + (setf (column-transform-default pgcol) t))) + pgcol))) diff --git a/src/utils/transforms.lisp b/src/utils/transforms.lisp index dbc39b9..4d77c71 100644 --- a/src/utils/transforms.lisp +++ b/src/utils/transforms.lisp @@ -53,7 +53,7 @@ (string= "set" data-type)) (let ((start-1 (position #\( column-type)) ; just before start position (end (position #\) column-type))) ; just before end position - (when start-1 + (when (and start-1 (< (+ 1 start-1) end)) (destructuring-bind (a &optional b) (mapcar #'parse-integer (sq:split-sequence #\, column-type From d356bd501b557b41502a45ac7471e055112962f5 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Mon, 10 Sep 2018 19:33:39 +0200 Subject: [PATCH 12/69] Accept even more ragged date format input. When parsing a date string from a date format, accept that the ms or us part be completely missing, rather than just missing some digits. Fixed #828. --- src/parsers/date-format.lisp | 7 ++++--- test/csv-parse-date.load | 1 + test/regress/expected/csv-parse-date.out | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/parsers/date-format.lisp b/src/parsers/date-format.lisp index d7b7fc6..45ba4d7 100644 --- a/src/parsers/date-format.lisp +++ b/src/parsers/date-format.lisp @@ -38,11 +38,12 @@ :for ragged-end := (when end (cond ((member name '(:msecs :usecs)) ;; take any number of digits up to - ;; the specified field lenght + ;; the specified field length ;; (less digits are allowed) - (min end (length date-string))) + (when (<= start (length date-string)) + (min end (length date-string)))) (t end))) - :when (and start end) + :when (and start ragged-end) :append (list name (subseq date-string start ragged-end))) (if (or (string= year "0000") (string= month "00") diff --git a/test/csv-parse-date.load b/test/csv-parse-date.load index 318df8a..9d74e22 100644 --- a/test/csv-parse-date.load +++ b/test/csv-parse-date.load @@ -28,3 +28,4 @@ LOAD CSV 
1,10-02-1999 00-33-12.123456,"00:05.02" 2,10-02-2014 00-33-13.123,"18:25.52" 3,10-02-2014 00-33-14.1234,13:14.15 +4,10-09-2018 19-24-59,19:24.59 diff --git a/test/regress/expected/csv-parse-date.out b/test/regress/expected/csv-parse-date.out index 4f4e941..d21c37a 100644 --- a/test/regress/expected/csv-parse-date.out +++ b/test/regress/expected/csv-parse-date.out @@ -1,3 +1,4 @@ 1 1999-10-02 00:33:12.123456+02 00:05:02 2 2014-10-02 00:33:13.123+02 18:25:52 3 2014-10-02 00:33:14.1234+02 13:14:15 +4 2018-10-09 19:24:59+02 19:24:59 From 0957bd0efa901fd4b352cc3f3349ac044d369ad9 Mon Sep 17 00:00:00 2001 From: Jon Snell Date: Fri, 5 Oct 2018 05:47:54 -0500 Subject: [PATCH 13/69] Fix pgloader bug #844 by adding support for mssql real types (#845) --- src/monkey/mssql.lisp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/monkey/mssql.lisp b/src/monkey/mssql.lisp index 9cd4e41..ce2abf6 100644 --- a/src/monkey/mssql.lisp +++ b/src/monkey/mssql.lisp @@ -93,6 +93,7 @@ (:syb-int2 (unsigned-to-signed (mem-ref data :unsigned-int) 2)) (:syb-int4 (unsigned-to-signed (mem-ref data :unsigned-int) 4)) (:syb-int8 (mem-ref data :int8)) + (:syb-real (mem-ref data :float)) (:syb-flt8 (mem-ref data :double)) ((:syb-datetime :syb-datetime4 :syb-msdate) (with-foreign-pointer (%buf +numeric-buf-sz+) From 344d0ca61b3f34b565cf60f719a33f4f99f01254 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 10 Oct 2018 11:08:28 -0700 Subject: [PATCH 14/69] Implement AFTER SCHEMA sql code blocks. This allows pgloader users to run SQL commands in between pgloader's schema creation and the actual loading of the data. 
--- src/load/migrate-database.lisp | 32 ++++++++++++++++++++---------- src/package.lisp | 1 + src/parsers/command-pgsql.lisp | 9 ++++++--- src/parsers/command-sql-block.lisp | 31 +++++++++++++++++++---------- 4 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index 044d931..28f57c9 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -255,6 +255,7 @@ (reset-sequences t) (foreign-keys t) (reindex nil) + (after-schema nil) only-tables including excluding @@ -329,17 +330,26 @@ ;; if asked, first drop/create the tables on the PostgreSQL side (handler-case - (prepare-pgsql-database copy - catalog - :truncate truncate - :create-tables create-tables - :create-schemas create-schemas - :drop-indexes drop-indexes - :drop-schema drop-schema - :include-drop include-drop - :foreign-keys foreign-keys - :set-table-oids set-table-oids - :materialize-views materialize-views) + (progn + (prepare-pgsql-database copy + catalog + :truncate truncate + :create-tables create-tables + :create-schemas create-schemas + :drop-indexes drop-indexes + :drop-schema drop-schema + :include-drop include-drop + :foreign-keys foreign-keys + :set-table-oids set-table-oids + :materialize-views materialize-views) + + ;; if there's an AFTER SCHEMA DO/EXECUTE command, now is the time + ;; to run it. 
+ (when after-schema + (pgloader.parser::execute-sql-code-block (target-db copy) + :pre + after-schema + "after schema"))) ;; ;; In case some error happens in the preparatory transaction, we ;; need to stop now and refrain from trying to load the data into diff --git a/src/package.lisp b/src/package.lisp index 8e81cdf..bc9abfe 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -812,6 +812,7 @@ (:export #:parse-commands #:parse-commands-from-file #:initialize-context + #:execute-sql-code-block ;; tools to enable complete cli parsing in main.lisp #:process-relative-pathnames diff --git a/src/parsers/command-pgsql.lisp b/src/parsers/command-pgsql.lisp index 3650534..6599c4f 100644 --- a/src/parsers/command-pgsql.lisp +++ b/src/parsers/command-pgsql.lisp @@ -79,6 +79,7 @@ excluding-matching-in-schema decoding-tables-as before-load + after-schema after-load)) (:lambda (clauses-list) (alexandria:alist-plist clauses-list))) @@ -103,11 +104,11 @@ (defun lisp-code-for-loading-from-pgsql (pg-src-db-conn pg-dst-db-conn &key gucs - casts before after options + casts options + before after after-schema alter-table alter-schema ((:including incl)) ((:excluding excl)) - ((:decoding decoding-as)) &allow-other-keys) `(lambda () (let* ((*default-cast-rules* ',*pgsql-default-cast-rules*) @@ -131,6 +132,7 @@ :index-names :preserve :set-table-oids t :on-error-stop on-error-stop + :after-schema ',after-schema ,@(remove-batch-control-option options)) ,(sql-code-block pg-dst-db-conn :post after "after load")))) @@ -140,7 +142,7 @@ (destructuring-bind (pg-src-db-uri pg-dst-db-uri &key - gucs casts before after options + gucs casts before after after-schema options alter-table alter-schema including excluding decoding) source @@ -152,6 +154,7 @@ :casts casts :before before :after after + :after-schema after-schema :options options :alter-table alter-table :alter-schema alter-schema diff --git a/src/parsers/command-sql-block.lisp b/src/parsers/command-sql-block.lisp index 
dba0a4b..e99bd07 100644 --- a/src/parsers/command-sql-block.lisp +++ b/src/parsers/command-sql-block.lisp @@ -58,17 +58,26 @@ (bind (((_ _ sql-list-of-list) after)) (cons :after (apply #'append sql-list-of-list))))) +(defrule after-schema (and kw-after kw-create kw-schema + (+ (or load-do load-execute))) + (:lambda (after) + (bind (((_ _ _ sql-list-of-list) after)) + (cons :after-schema (apply #'append sql-list-of-list))))) + (defun sql-code-block (pgconn section commands label) "Return lisp code to run COMMANDS against DBNAME, updating STATE." (when commands - `(with-stats-collection (,label - :dbname ,(db-name pgconn) - :section ,section - :use-result-as-read t - :use-result-as-rows t) - (log-message :notice "Executing SQL block for ~a" ,label) - (with-pgsql-transaction (:pgconn ,pgconn) - (loop for command in ',commands - do - (pgsql-execute command :client-min-messages :error) - counting command))))) + `(execute-sql-code-block ,pgconn ,section ',commands ,label))) + +(defun execute-sql-code-block (pgconn section commands label) + "Exceute given SQL commands." + (with-stats-collection (label + :dbname (db-name pgconn) + :section section + :use-result-as-read t + :use-result-as-rows t) + (log-message :notice "Executing SQL block for ~a" label) + (with-pgsql-transaction (:pgconn pgconn) + (loop :for command :in commands + :do (pgsql-execute command :client-min-messages :error) + :counting command)))) From 381ac9d1a2378fda9317fdbae319e7cc642d3a79 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 10 Oct 2018 14:15:28 -0700 Subject: [PATCH 15/69] Add initial support for Citus distribution from pgloader. The idea is for pgloader to tweak the schema from a description of the sharding model, the distribute clause. 
Here's an example of such a clause: distribute company using id distribute campaign using company_id distribute ads using company_id from campaign distribute clicks using company_id from ads, campaign Given such commands, pgloader adds the distribution key to the table when needed, to the primary key definition of the table, and also to the foreign keys that are pointing to the changed primary key. Then when SELECTing the data from the source database, the idea is for pgloader to automatically JOIN the base table with the source table where to find the distribution key, in case it was just added in the schema. Finally, pgloader also calls the following Citus commands: SELECT create_distributed_table('company', 'id'); SELECT create_distributed_table('campaign', 'company_id'); SELECT create_distributed_table('ads', 'company_id'); SELECT create_distributed_table('clicks', 'company_id'); --- pgloader.asd | 3 + src/load/migrate-database.lisp | 29 ++++++++-- src/package.lisp | 10 ++++ src/parsers/command-distribute.lisp | 48 ++++++++++++++++ src/parsers/command-keywords.lisp | 3 + src/parsers/command-pgsql.lisp | 8 ++- src/pgsql/pgsql-ddl-citus.lisp | 18 ++++++ src/pgsql/pgsql-schema.lisp | 4 +- src/pgsql/sql/list-all-indexes.sql | 5 ++ src/utils/catalog.lisp | 3 +- src/utils/citus.lisp | 89 +++++++++++++++++++++++++++++ 11 files changed, 209 insertions(+), 11 deletions(-) create mode 100644 src/parsers/command-distribute.lisp create mode 100644 src/pgsql/pgsql-ddl-citus.lisp create mode 100644 src/utils/citus.lisp diff --git a/pgloader.asd b/pgloader.asd index 12b1684..3d12ebd 100644 --- a/pgloader.asd +++ b/pgloader.asd @@ -69,6 +69,7 @@ (:file "quoting" :depends-on ("utils")) (:file "catalog" :depends-on ("quoting")) (:file "alter-table" :depends-on ("catalog")) + (:file "citus" :depends-on ("catalog")) ;; State, monitoring, reporting (:file "reject" :depends-on ("state")) @@ -95,6 +96,7 @@ :components ((:file "connection") (:file "pgsql-ddl") + (:file 
"pgsql-ddl-citus") (:file "pgsql-schema") (:file "merge-catalogs" :depends-on ("pgsql-schema")) (:file "pgsql-trigger") @@ -239,6 +241,7 @@ (:file "command-cast-rules") (:file "command-materialize-views") (:file "command-alter-table") + (:file "command-distribute") (:file "command-mysql") (:file "command-including-like") (:file "command-mssql") diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index 28f57c9..129ca5b 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -19,7 +19,8 @@ set-table-oids materialize-views foreign-keys - include-drop) + include-drop + distribute) "Prepare the target PostgreSQL database: create tables casting datatypes from the MySQL definitions, prepare index definitions and create target tables for materialized views. @@ -114,7 +115,15 @@ :use-result-as-rows t) (create-views catalog :include-drop include-drop - :client-min-messages :error)))) + :client-min-messages :error))) + + ;; Citus Support + (when distribute + (with-stats-collection ("Citus Distribute Tables" :section :pre) + (let ((citus-sql + (loop :for rule :in distribute + :collect (format-create-sql rule)))) + (pgsql-execute citus-sql :client-min-messages :notice))))) ;; log the catalog we just fetched and (maybe) merged (log-message :data "CATALOG: ~s" catalog)) @@ -213,9 +222,10 @@ :reset-sequences reset-sequences)))) -(defun process-catalog (copy catalog &key alter-table alter-schema) +(defun process-catalog (copy catalog &key alter-table alter-schema distribute) "Do all the PostgreSQL catalog tweaking here: casts, index WHERE clause rewriting, pgloader level alter schema and alter table commands." + ;; cast the catalog into something PostgreSQL can work on (cast catalog) @@ -229,7 +239,11 @@ ;; if asked, now alter the catalog with given rules: the alter-table ;; keyword parameter actually contains a set of alter table rules. 
(when alter-table - (alter-table catalog alter-table))) + (alter-table catalog alter-table)) + + ;; we also support schema changes necessary for Citus distribution + (when distribute + (pgloader.catalog::citus-distribute-schema catalog distribute))) ;;; @@ -256,6 +270,7 @@ (foreign-keys t) (reindex nil) (after-schema nil) + distribute only-tables including excluding @@ -326,7 +341,8 @@ ;; that's CAST rules, index WHERE clause rewriting and ALTER commands (process-catalog copy catalog :alter-table alter-table - :alter-schema alter-schema) + :alter-schema alter-schema + :distribute distribute) ;; if asked, first drop/create the tables on the PostgreSQL side (handler-case @@ -341,7 +357,8 @@ :include-drop include-drop :foreign-keys foreign-keys :set-table-oids set-table-oids - :materialize-views materialize-views) + :materialize-views materialize-views + :distribute distribute) ;; if there's an AFTER SCHEMA DO/EXECUTE command, now is the time ;; to run it. diff --git a/src/package.lisp b/src/package.lisp index bc9abfe..8d32d64 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -98,6 +98,7 @@ #:table-index-list #:table-fkey-list #:table-trigger-list + #:table-citus-rule #:extension-name #:extension-schema @@ -208,6 +209,15 @@ #:match-rule-action #:match-rule-args + #:citus-reference-table + #:citus-distributed-table + #:make-citus-reference-table + #:make-citus-distributed-table + #:citus-reference-table-table + #:citus-distributed-table-table + #:citus-distributed-table-using + #:citus-distributed-table-from + #:format-table-name)) (defpackage #:pgloader.state diff --git a/src/parsers/command-distribute.lisp b/src/parsers/command-distribute.lisp new file mode 100644 index 0000000..0a642b7 --- /dev/null +++ b/src/parsers/command-distribute.lisp @@ -0,0 +1,48 @@ +#| + distribute billers using id + distribute bills using biller_id + distribute receivable_accounts using biller_id + distribute payments using biller_id + + distribute splits using biller_id + from 
receivable_accounts + + distribute ach_accounts as reference table +|# + +(in-package :pgloader.parser) + +(defun create-table-from-dsn-table-name (dsn-table-name + &optional (schema-name "public")) + (let ((table (create-table (cdr (second dsn-table-name))))) + (unless (table-schema table) + (setf (table-schema table) + (make-schema :catalog nil + :source-name schema-name + :name (apply-identifier-case schema-name)))) + table)) + +(defrule distribute-reference (and kw-distribute dsn-table-name + kw-as kw-reference kw-table) + (:lambda (d-r) + (make-citus-reference-table :table (create-table-from-dsn-table-name d-r)))) + +(defrule distribute-using (and kw-distribute dsn-table-name + kw-using maybe-quoted-namestring) + (:lambda (d-u) + (make-citus-distributed-table :table (create-table-from-dsn-table-name d-u) + :using (make-column :name (fourth d-u))))) + +(defrule distribute-using-from (and kw-distribute dsn-table-name + kw-using maybe-quoted-namestring + kw-from (+ maybe-quoted-namestring)) + (:lambda (d-u-f) + (make-citus-distributed-table :table (create-table-from-dsn-table-name d-u-f) + :using (make-column :name (fourth d-u-f)) + :from (apply #'create-table (sixth d-u-f))))) + +(defrule distribute-commands (+ (or distribute-using-from + distribute-using + distribute-reference)) + (:lambda (commands) + (cons :distribute commands))) diff --git a/src/parsers/command-keywords.lisp b/src/parsers/command-keywords.lisp index a2454cd..9a4dcea 100644 --- a/src/parsers/command-keywords.lisp +++ b/src/parsers/command-keywords.lisp @@ -103,6 +103,9 @@ (def-keyword-rule "trim") (def-keyword-rule "unquoted") (def-keyword-rule "delimiter") + ;; option for Citus support + (def-keyword-rule "distribute") + (def-keyword-rule "reference") ;; option for MySQL imports (def-keyword-rule "schema") (def-keyword-rule "schemas") diff --git a/src/parsers/command-pgsql.lisp b/src/parsers/command-pgsql.lisp index 6599c4f..f5f7996 100644 --- a/src/parsers/command-pgsql.lisp +++ 
b/src/parsers/command-pgsql.lisp @@ -80,7 +80,8 @@ decoding-tables-as before-load after-schema - after-load)) + after-load + distribute-commands)) (:lambda (clauses-list) (alexandria:alist-plist clauses-list))) @@ -109,6 +110,7 @@ alter-table alter-schema ((:including incl)) ((:excluding excl)) + distribute &allow-other-keys) `(lambda () (let* ((*default-cast-rules* ',*pgsql-default-cast-rules*) @@ -133,6 +135,7 @@ :set-table-oids t :on-error-stop on-error-stop :after-schema ',after-schema + :distribute ',distribute ,@(remove-batch-control-option options)) ,(sql-code-block pg-dst-db-conn :post after "after load")))) @@ -143,7 +146,7 @@ pg-dst-db-uri &key gucs casts before after after-schema options - alter-table alter-schema + alter-table alter-schema distribute including excluding decoding) source (cond (*dry-run* @@ -158,6 +161,7 @@ :options options :alter-table alter-table :alter-schema alter-schema + :distribute distribute :including including :excluding excluding :decoding decoding)))))) diff --git a/src/pgsql/pgsql-ddl-citus.lisp b/src/pgsql/pgsql-ddl-citus.lisp new file mode 100644 index 0000000..f74ade5 --- /dev/null +++ b/src/pgsql/pgsql-ddl-citus.lisp @@ -0,0 +1,18 @@ +;;; +;;; PostgreSQL Citus support for calling functions. 
+;;; + +(in-package :pgloader.pgsql) + +(defmethod format-create-sql ((rule citus-reference-table) + &key (stream nil) if-not-exists) + (declare (ignore if-not-exists)) + (format stream "SELECT create_reference_table('~a');" + (format-table-name (citus-reference-table-table rule)))) + +(defmethod format-create-sql ((rule citus-distributed-table) + &key (stream nil) if-not-exists) + (declare (ignore if-not-exists)) + (format stream "SELECT create_distributed_table('~a', '~a');" + (format-table-name (citus-distributed-table-table rule)) + (column-name (citus-distributed-table-using rule)))) diff --git a/src/pgsql/pgsql-schema.lisp b/src/pgsql/pgsql-schema.lisp index 9ea3d59..72da2ac 100644 --- a/src/pgsql/pgsql-schema.lisp +++ b/src/pgsql/pgsql-schema.lisp @@ -198,7 +198,7 @@ (loop :for (schema-name name oid table-schema table-name - primary unique sql conname condef) + primary unique cols sql conname condef) :in (query nil (format nil (sql "/pgsql/list-all-indexes.sql") @@ -222,7 +222,7 @@ :table table :primary primary :unique unique - :columns nil + :columns (split-sequence:split-sequence #\, cols) :sql sql :conname (unless (eq :null conname) (ensure-quoted conname)) diff --git a/src/pgsql/sql/list-all-indexes.sql b/src/pgsql/sql/list-all-indexes.sql index bfffbf7..1f655fa 100644 --- a/src/pgsql/sql/list-all-indexes.sql +++ b/src/pgsql/sql/list-all-indexes.sql @@ -9,6 +9,11 @@ r.relname, indisprimary, indisunique, + (select string_agg(attname, ',') + from pg_attribute + where attrelid = r.oid + and array[attnum::integer] <@ indkey::integer[] + ) as cols, pg_get_indexdef(indexrelid), c.conname, pg_get_constraintdef(c.oid) diff --git a/src/utils/catalog.lisp b/src/utils/catalog.lisp index c61ce8f..46ddbc6 100644 --- a/src/utils/catalog.lisp +++ b/src/utils/catalog.lisp @@ -50,7 +50,8 @@ (defstruct table source-name name schema oid comment storage-parameter-list ;; field is for SOURCE ;; column is for TARGET - field-list column-list index-list fkey-list trigger-list) 
+ ;; citus is an extra slot for citus support + field-list column-list index-list fkey-list trigger-list citus-rule) ;;; ;;; When migrating from PostgreSQL to PostgreSQL we might have to install diff --git a/src/utils/citus.lisp b/src/utils/citus.lisp new file mode 100644 index 0000000..b080afb --- /dev/null +++ b/src/utils/citus.lisp @@ -0,0 +1,89 @@ +;;; +;;; Citus support in pgloader allows to declare what needs to change in the +;;; source schema in terms of Citus concepts: reference and distributed +;;; table. +;;; + +#| + distribute billers using id + distribute bills using biller_id + distribute receivable_accounts using biller_id + distribute payments using biller_id + + distribute splits using biller_id + from receivable_accounts + + distribute ach_accounts as reference table +|# + + +(in-package #:pgloader.catalog) + +(defstruct citus-reference-table table) +(defstruct citus-distributed-table table using from) + +(defun citus-distribute-schema (catalog distribution-rules) + "Distribute a CATALOG with given user provided DISTRIBUTION-RULES." + (loop :for rule :in distribution-rules + :do (let ((table (citus-find-table catalog (citus-rule-table rule)))) + (apply-citus-rule rule table)))) + +(defun citus-rule-table (rule) + (etypecase rule + (citus-reference-table (citus-reference-table-table rule)) + (citus-distributed-table (citus-distributed-table-table rule)))) + +(defun citus-find-table (catalog table) + (let* ((table-name (table-name table)) + (schema-name (schema-name (table-schema table)))) + (find-table (find-schema catalog schema-name) table-name))) + +(defgeneric apply-citus-rule (rule table) + (:documentation "Apply a Citus distribution RULE to given TABLE.")) + +(defmethod apply-citus-rule ((rule citus-reference-table) (table table)) + ;; for a reference table, we have nothing to do really. 
+ (setf (table-citus-rule table) rule)) + +(defmethod apply-citus-rule ((rule citus-distributed-table) (table table)) + (setf (table-citus-rule table) rule) + + ;; ok now we need to check if the USING column exists or if we need to add + ;; it to our model + (let ((column (find (column-name (citus-distributed-table-using rule)) + (table-field-list table) + :test #'string= + :key #'column-name))) + (assert (not (null column))) + + (if column + + ;; add it to the PKEY definition, in first position + (let* ((index (find-if #'index-primary (table-index-list table))) + (idxcol (find (column-name (citus-distributed-table-using rule)) + (index-columns index) + :test #'string=))) + (assert (not (null index))) + (unless idxcol + ;; add a new column + (push (column-name (citus-distributed-table-using rule)) + (index-columns index)) + ;; now remove origin schema sql and condef, we need to redo them + (setf (index-sql index) nil) + (setf (index-condef index) nil))) + + ;; the column doesn't exist, we need to find it in the :FROM rule + (let* ((from-table + (citus-find-table (schema-catalog (table-schema table)) + (citus-distributed-table-from rule))) + (column-definition + (find (column-name (citus-distributed-table-using rule)) + (table-field-list from-table) + :test #'string= + :key #'column-name))) + (assert (not (null from-table))) + (push (make-column :name (column-name column-definition) + :type-name (column-type-name column-definition) + :nullable (column-nullable column-definition) + :transform (column-transform column-definition)) + (table-column-list table)))))) From 760763be4bb4cc7b45130727a2c303a79943c112 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 10 Oct 2018 15:44:21 -0700 Subject: [PATCH 16/69] Use the constraint name when we have it. That's important for Citus, which doesn't know how to ADD a constraint without a name. 
--- src/pgsql/pgsql-ddl.lisp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pgsql/pgsql-ddl.lisp b/src/pgsql/pgsql-ddl.lisp index 580618e..662bc95 100644 --- a/src/pgsql/pgsql-ddl.lisp +++ b/src/pgsql/pgsql-ddl.lisp @@ -204,8 +204,9 @@ ;; don't use the index schema name here, PostgreSQL doesn't ;; like it, might be implicit from the table's schema ;; itself... - "ALTER TABLE ~a ADD ~a USING INDEX ~a;" + "ALTER TABLE ~a ADD~@[ CONSTRAINT ~a~] ~a USING INDEX ~a;" (format-table-name table) + (index-conname index) (cond ((index-primary index) "PRIMARY KEY") ((index-unique index) "UNIQUE")) index-name))) From 8112a9b54fc8124ec849324803ebfdb67c1eda2d Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 16 Oct 2018 18:53:41 +0200 Subject: [PATCH 17/69] Improve Citus Distribution Support. With this patch it's now actually possible to backfill the data on the fly when using the "distribute" new commands. The schema is modified to add the distribution key where specified, and changes to the primary and foreign keys happen automatically. Then a JOIN is generated to get the data directly during the COPY streaming to the Citus cluster. 
--- src/load/migrate-database.lisp | 20 ++-- src/package.lisp | 4 + src/parsers/command-distribute.lisp | 29 ++++- src/pgsql/pgsql-create-schema.lisp | 11 ++ src/pgsql/pgsql-schema.lisp | 13 ++- src/pgsql/sql/list-all-fkeys.sql | 7 +- src/sources/pgsql/pgsql.lisp | 30 ++++-- src/utils/catalog.lisp | 2 +- src/utils/citus.lisp | 160 +++++++++++++++++++++++----- 9 files changed, 234 insertions(+), 42 deletions(-) diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index 129ca5b..d99efbf 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -115,15 +115,21 @@ :use-result-as-rows t) (create-views catalog :include-drop include-drop - :client-min-messages :error))) + :client-min-messages :error)))) - ;; Citus Support - (when distribute + ;; Citus Support + ;; + ;; We need a separate transaction here in some cases, because of the + ;; distributed DDL support from Citus, to avoid the following error: + ;; + ;; ERROR Database error 25001: cannot establish a new connection for + ;; placement 2299, since DDL has been executed on a connection that is in + ;; use + ;; + (when distribute + (with-pgsql-transaction (:pgconn (target-db copy)) (with-stats-collection ("Citus Distribute Tables" :section :pre) - (let ((citus-sql - (loop :for rule :in distribute - :collect (format-create-sql rule)))) - (pgsql-execute citus-sql :client-min-messages :notice))))) + (create-distributed-table distribute)))) ;; log the catalog we just fetched and (maybe) merged (log-message :data "CATALOG: ~s" catalog)) diff --git a/src/package.lisp b/src/package.lisp index 8d32d64..e1e74bf 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -217,6 +217,8 @@ #:citus-distributed-table-table #:citus-distributed-table-using #:citus-distributed-table-from + #:citus-format-sql-select + #:citus-backfill-table-p #:format-table-name)) @@ -433,6 +435,8 @@ #:reset-sequences #:comment-on-tables-and-columns + #:create-distributed-table + ;; finalizing catalogs 
support (redshift and other variants) #:finalize-catalogs #:adjust-data-types diff --git a/src/parsers/command-distribute.lisp b/src/parsers/command-distribute.lisp index 0a642b7..6ae0b66 100644 --- a/src/parsers/command-distribute.lisp +++ b/src/parsers/command-distribute.lisp @@ -33,13 +33,38 @@ (make-citus-distributed-table :table (create-table-from-dsn-table-name d-u) :using (make-column :name (fourth d-u))))) +;;; +;;; The namestring rule allows for commas and we use them as a separator +;;; here, so we need to have our own table name parsing. That's a bummer, +;;; maybe we should revisit the whole table names parsing code? +;;; +(defrule distribute-from-tablename + (or double-quoted-namestring + quoted-namestring + (and (or #\_ (alpha-char-p character)) + (* (or (alpha-char-p character) + (digit-char-p character))))) + (:text t)) + +(defrule maybe-qualified-dist-from-table-name + (and distribute-from-tablename (? (and "." distribute-from-tablename))) + (:lambda (name) + (if (second name) + (cons (first name) (second (second name))) + (cons "public" (first name))))) + +(defrule distribute-from-list (+ (and maybe-qualified-dist-from-table-name + (? 
(and "," ignore-whitespace)))) + (:lambda (from-list) + (mapcar #'first from-list))) + (defrule distribute-using-from (and kw-distribute dsn-table-name kw-using maybe-quoted-namestring - kw-from (+ maybe-quoted-namestring)) + kw-from distribute-from-list) (:lambda (d-u-f) (make-citus-distributed-table :table (create-table-from-dsn-table-name d-u-f) :using (make-column :name (fourth d-u-f)) - :from (apply #'create-table (sixth d-u-f))))) + :from (mapcar #'create-table (sixth d-u-f))))) (defrule distribute-commands (+ (or distribute-using-from distribute-using diff --git a/src/pgsql/pgsql-create-schema.lisp b/src/pgsql/pgsql-create-schema.lisp index e6154e5..b06c31d 100644 --- a/src/pgsql/pgsql-create-schema.lisp +++ b/src/pgsql/pgsql-create-schema.lisp @@ -465,3 +465,14 @@ $$; " tables))) (column-name column) quote (column-comment column) quote))))) (pgsql-execute-with-timing section label sql-list))) + + + +;;; +;;; Citus Disitribution support +;;; +(defun create-distributed-table (distribute-rules) + (let ((citus-sql + (loop :for rule :in distribute-rules + :collect (format-create-sql rule)))) + (pgsql-execute citus-sql))) diff --git a/src/pgsql/pgsql-schema.lisp b/src/pgsql/pgsql-schema.lisp index 72da2ac..59f33f0 100644 --- a/src/pgsql/pgsql-schema.lisp +++ b/src/pgsql/pgsql-schema.lisp @@ -235,7 +235,7 @@ "Get the list of PostgreSQL index definitions per table." 
(loop :for (schema-name table-name fschema-name ftable-name - conoid conname condef + conoid pkeyoid conname condef cols fcols updrule delrule mrule deferrable deferred) :in (query nil @@ -277,9 +277,13 @@ (table (find-table schema table-name)) (fschema (find-schema catalog fschema-name)) (ftable (find-table fschema ftable-name)) + (pkey (find pkeyoid (table-index-list ftable) + :test #'= + :key #'index-oid)) (fk (make-fkey :name (ensure-quoted conname) :oid conoid + :pkey pkey :condef condef :table table :columns (split-sequence:split-sequence #\, cols) @@ -290,6 +294,13 @@ :match-rule (pg-fk-match-rule-to-match-clause mrule) :deferrable deferrable :initially-deferred deferred))) + ;; add the fkey reference to the pkey index too + (unless (find conoid + (index-fk-deps pkey) + :test #'= + :key #'fkey-oid) + (push-to-end fk (index-fk-deps pkey))) + ;; check that both tables are in pgloader's scope (if (and table ftable) (add-fkey table fk) (log-message :notice "Foreign Key ~a is ignored, one of its table is missing from pgloader table selection" diff --git a/src/pgsql/sql/list-all-fkeys.sql b/src/pgsql/sql/list-all-fkeys.sql index 8ebe8b5..bc666d1 100644 --- a/src/pgsql/sql/list-all-fkeys.sql +++ b/src/pgsql/sql/list-all-fkeys.sql @@ -7,7 +7,9 @@ -- excluding (ftable) -- filter-list-to-where-clause for excluding select n.nspname, c.relname, nf.nspname, cf.relname as frelname, - r.oid, conname, + r.oid, + d.refobjid as pkeyoid, + conname, pg_catalog.pg_get_constraintdef(r.oid, true) as condef, (select string_agg(attname, ',') from pg_attribute @@ -26,6 +28,9 @@ JOIN pg_namespace n on c.relnamespace = n.oid JOIN pg_class cf on r.confrelid = cf.oid JOIN pg_namespace nf on cf.relnamespace = nf.oid + JOIN pg_depend d on d.classid = 'pg_constraint'::regclass + and d.objid = r.oid + and d.refobjsubid = 0 where r.contype = 'f' AND c.relkind in ('r', 'f', 'p') AND cf.relkind in ('r', 'f', 'p') diff --git a/src/sources/pgsql/pgsql.lisp b/src/sources/pgsql/pgsql.lisp index 
8a45a58..da6d611 100644 --- a/src/sources/pgsql/pgsql.lisp +++ b/src/sources/pgsql/pgsql.lisp @@ -41,12 +41,30 @@ (funcall process-row-fn row))))))) (with-pgsql-connection ((source-db pgsql)) - (let* ((cols (mapcar #'column-name (fields pgsql))) - (sql - (format nil "SELECT ~{~s::text~^, ~} FROM ~s.~s" cols - (schema-source-name (table-schema (source pgsql))) - (table-source-name (source pgsql))))) - (cl-postgres:exec-query pomo:*database* sql map-reader))))) + (if (citus-backfill-table-p (target pgsql)) + ;; + ;; SELECT dist_key, * FROM source JOIN dist ON ... + ;; + (let ((sql (citus-format-sql-select (source pgsql) (target pgsql)))) + (log-message :sql "~a" sql) + (cl-postgres:exec-query pomo:*database* sql map-reader)) + + ;; + ;; No JOIN to add to backfill data in the SQL query here. + ;; + (let* ((cols (mapcar #'column-name (fields pgsql))) + (sql + (format nil + "SELECT ~{~s::text~^, ~} FROM ~s.~s" + cols + (schema-source-name (table-schema (source pgsql))) + (table-source-name (source pgsql))))) + (log-message :sql "~a" sql) + (cl-postgres:exec-query pomo:*database* sql map-reader)))))) + +(defmethod copy-column-list ((pgsql copy-pgsql)) + "We are sending the data in the MySQL columns ordering here." 
+ (mapcar #'column-name (fields pgsql))) (defmethod fetch-metadata ((pgsql copy-pgsql) (catalog catalog) diff --git a/src/utils/catalog.lisp b/src/utils/catalog.lisp index 46ddbc6..6b29aad 100644 --- a/src/utils/catalog.lisp +++ b/src/utils/catalog.lisp @@ -78,7 +78,7 @@ ;;; Index and Foreign Keys ;;; (defstruct fkey - name oid table columns foreign-table foreign-columns condef + name oid table columns pkey foreign-table foreign-columns condef update-rule delete-rule match-rule deferrable initially-deferred) ;;; diff --git a/src/utils/citus.lisp b/src/utils/citus.lisp index b080afb..0cdcc9b 100644 --- a/src/utils/citus.lisp +++ b/src/utils/citus.lisp @@ -48,42 +48,154 @@ (defmethod apply-citus-rule ((rule citus-distributed-table) (table table)) (setf (table-citus-rule table) rule) + ;; + ;; Replace the TABLE placeholders in the :FROM slot of the rule with the + ;; tables from the catalogs. + ;; + (when (citus-distributed-table-from rule) + (let ((catalog (schema-catalog (table-schema table)))) + (map-into (citus-distributed-table-from rule) + (lambda (from) (citus-find-table catalog from)) + (citus-distributed-table-from rule)))) + ;; ok now we need to check if the USING column exists or if we need to add ;; it to our model (let ((column (find (column-name (citus-distributed-table-using rule)) (table-field-list table) :test #'string= :key #'column-name))) - (assert (not (null column))) - (if column ;; add it to the PKEY definition, in first position - (let* ((index (find-if #'index-primary (table-index-list table))) - (idxcol (find (column-name (citus-distributed-table-using rule)) - (index-columns index) - :test #'string=))) - (assert (not (null index))) - (unless idxcol - ;; add a new column - (push (column-name (citus-distributed-table-using rule)) - (index-columns index)) - ;; now remove origin schema sql and condef, we need to redo them - (setf (index-sql index) nil) - (setf (index-condef index) nil))) + (add-column-to-pkey table + (column-name 
(citus-distributed-table-using rule))) - ;; the column doesn't exist, we need to find it in the :FROM rule - (let* ((from-table - (citus-find-table (schema-catalog (table-schema table)) - (citus-distributed-table-from rule))) + ;; The column doesn't exist, we need to find it in the :FROM rule's + ;; list. The :FROM slot of the rule is a list of tables to + ;; "traverse" when backfilling the data. The list follows the + ;; foreign-key relationships from TABLE to the source of the + ;; distribution key. + ;; + ;; To find the column definition to add to the current TABLE, look + ;; it up in the last entry of the FROM rule's list. + (let* ((last-from-rule (car (last (citus-distributed-table-from rule)))) (column-definition (find (column-name (citus-distributed-table-using rule)) - (table-field-list from-table) + (table-field-list last-from-rule) :test #'string= - :key #'column-name))) - (assert (not (null from-table))) - (push (make-column :name (column-name column-definition) + :key #'column-name)) + (new-column + (make-column :name (column-name column-definition) :type-name (column-type-name column-definition) :nullable (column-nullable column-definition) - :transform (column-transform column-definition)) - (table-column-list table)))))) + :transform (column-transform column-definition)))) + ;; + ;; Here also we need to add the new column to the PKEY definition, + ;; in first position. + ;; + (add-column-to-pkey table (column-name new-column)) + + ;; + ;; We need to backfill the distribution key in the data, which + ;; we're implementing with a JOIN when we SELECT from the source + ;; table. We add the new field here. + ;; + (push new-column (table-field-list table)) + (push new-column (table-column-list table)))))) + + +(defun add-column-to-pkey (table column-name) + "Add COLUMN in the first position of the TABLE's primary key index." 
+ (let* ((index (find-if #'index-primary (table-index-list table))) + (idxcol (find column-name (index-columns index) :test #'string=))) + (assert (not (null index))) + (unless idxcol + ;; add a new column + (push column-name (index-columns index)) + ;; now remove origin schema sql and condef, we need to redo them + (setf (index-sql index) nil) + (setf (index-condef index) nil) + + ;; now tweak the fkey definitions that are using this index + (loop :for fkey :in (index-fk-deps index) + :do (push column-name (fkey-columns fkey)) + :do (push column-name (fkey-foreign-columns fkey)) + :do (setf (fkey-condef fkey) nil))))) + + +(defun format-citus-join-clause (table distribution-rule) + "Format a JOIN clause to backfill the distribution key data in tables that + are referencing (even indirectly) the main distribution table." + (with-output-to-string (s) + (loop :for current-table := table :then rel + :for rel :in (citus-distributed-table-from distribution-rule) + :do (let* ((fkey + (find (ensure-unquoted (table-name rel)) + (table-fkey-list current-table) + :test #'string= + :key (lambda (fkey) + (ensure-unquoted + (table-name (fkey-foreign-table fkey)))))) + (ftable (fkey-foreign-table fkey))) + (format s + " JOIN ~s.~s" + (schema-source-name (table-schema ftable)) + (table-source-name ftable)) + ;; + ;; Skip the first column in the fkey definition, that's the + ;; distribution key that was just added by pgloader: we don't + ;; have it on the source database, we are going to create it on + ;; the target database. 
+ ;; + (loop :for first := t :then nil + :for c :in (cdr (fkey-columns fkey)) + :for fc :in (cdr (fkey-foreign-columns fkey)) + :do (format s + " ~:[AND~;ON~] ~a.~a = ~a.~a" + first + (table-source-name (fkey-table fkey)) + c + (table-source-name (fkey-foreign-table fkey)) + fc)))))) + +(defun citus-format-sql-select (source-table target-table) + "Return the SQL statement to use to fetch data from the COPY context, + including backfilling the distribution key in related tables." + + ;; + ;; SELECT from.id, id, ... from source join from-table ... + ;; + ;; So we must be careful to prefix the column names with the + ;; proper table name, because of the join(s), and the first column + ;; in the output is taken from the main FROM table (the last one + ;; in the rule). + ;; + (let* ((last-from-rule + (car (last (citus-distributed-table-from + (table-citus-rule target-table))))) + (cols + (append (list + (format nil "~a.~a" + (table-name last-from-rule) + (column-name (first (table-field-list source-table))))) + (mapcar (lambda (field) + (format nil "~a.~a" + (table-name source-table) + (column-name field))) + (rest (table-field-list source-table))))) + (joins + (format-citus-join-clause source-table + (table-citus-rule target-table)))) + (format nil + "SELECT ~{~a::text~^, ~} FROM ~s.~s ~a" + cols + (schema-source-name (table-schema source-table)) + (table-source-name source-table) + joins))) + +(defun citus-backfill-table-p (table) + "Returns non-nil when given TABLE should be backfilled with the + distribution key." + (and (table-citus-rule table) + (typep (table-citus-rule table) 'citus-distributed-table) + (not (null (citus-distributed-table-from (table-citus-rule table)))))) From d3b21ac54d090f5d8293b4332309775e820c42b1 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 18 Oct 2018 15:31:29 +0200 Subject: [PATCH 18/69] Implement automatic discovery of the Citus distribution rules. 
With this patch, the following distribution rule distribute companies using id is equivalent to the following distribution rule set, given foreign keys in the source schema: distribute companies using id distribute campaigns using company_id distribute ads using company_id from campaigns distribute clicks using company_id from ads, campaigns distribute impressions using company_id from ads, campaigns In the current code (of this patch) pgloader walks the foreign-keys dependency tree and knows how to automatically derive distribution rules from a single rule and the foreign keys. --- src/load/migrate-database.lisp | 13 +- src/package.lisp | 33 +++-- src/parsers/command-distribute.lisp | 12 +- src/pgsql/pgsql-ddl-citus.lisp | 10 +- src/utils/catalog.lisp | 2 +- src/utils/citus.lisp | 210 ++++++++++++++++++++++------ 6 files changed, 212 insertions(+), 68 deletions(-) diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index d99efbf..0a5452e 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -19,8 +19,7 @@ set-table-oids materialize-views foreign-keys - include-drop - distribute) + include-drop) "Prepare the target PostgreSQL database: create tables casting datatypes from the MySQL definitions, prepare index definitions and create target tables for materialized views. 
@@ -126,10 +125,10 @@ ;; placement 2299, since DDL has been executed on a connection that is in ;; use ;; - (when distribute + (when (catalog-distribution-rules catalog) (with-pgsql-transaction (:pgconn (target-db copy)) (with-stats-collection ("Citus Distribute Tables" :section :pre) - (create-distributed-table distribute)))) + (create-distributed-table (catalog-distribution-rules catalog))))) ;; log the catalog we just fetched and (maybe) merged (log-message :data "CATALOG: ~s" catalog)) @@ -249,7 +248,8 @@ ;; we also support schema changes necessary for Citus distribution (when distribute - (pgloader.catalog::citus-distribute-schema catalog distribute))) + (setf (catalog-distribution-rules catalog) + (citus-distribute-schema catalog distribute)))) ;;; @@ -363,8 +363,7 @@ :include-drop include-drop :foreign-keys foreign-keys :set-table-oids set-table-oids - :materialize-views materialize-views - :distribute distribute) + :materialize-views materialize-views) ;; if there's an AFTER SCHEMA DO/EXECUTE command, now is the time ;; to run it. 
diff --git a/src/package.lisp b/src/package.lisp index e1e74bf..0d3e5b6 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -77,6 +77,7 @@ #:catalog-name #:catalog-schema-list #:catalog-types-without-btree + #:catalog-distribution-rules #:schema-name #:schema-catalog @@ -209,14 +210,14 @@ #:match-rule-action #:match-rule-args - #:citus-reference-table - #:citus-distributed-table - #:make-citus-reference-table - #:make-citus-distributed-table - #:citus-reference-table-table - #:citus-distributed-table-table - #:citus-distributed-table-using - #:citus-distributed-table-from + #:citus-reference-rule + #:citus-distributed-rule + #:make-citus-reference-rule + #:make-citus-distributed-rule + #:citus-reference-rule-rule + #:citus-distributed-rule-table + #:citus-distributed-rule-using + #:citus-distributed-rule-from #:citus-format-sql-select #:citus-backfill-table-p @@ -288,6 +289,16 @@ (:export #:*queries* #:sql)) +(defpackage #:pgloader.citus + (:use #:cl + #:pgloader.params + #:pgloader.catalog + #:pgloader.quoting + #:pgloader.monitor) + (:export #:citus-distribute-schema + #:citus-format-sql-select + #:citus-backfill-table-p)) + (defpackage #:pgloader.utils (:use #:cl #:pgloader.params @@ -295,7 +306,8 @@ #:pgloader.quoting #:pgloader.catalog #:pgloader.monitor - #:pgloader.state) + #:pgloader.state + #:pgloader.citus) (:import-from #:alexandria #:appendf #:read-file-into-string) @@ -326,7 +338,8 @@ (cl-user::export-inherited-symbols "pgloader.quoting" "pgloader.utils") (cl-user::export-inherited-symbols "pgloader.catalog" "pgloader.utils") (cl-user::export-inherited-symbols "pgloader.monitor" "pgloader.utils") - (cl-user::export-inherited-symbols "pgloader.state" "pgloader.utils")) + (cl-user::export-inherited-symbols "pgloader.state" "pgloader.utils") + (cl-user::export-inherited-symbols "pgloader.citus" "pgloader.utils")) ;; diff --git a/src/parsers/command-distribute.lisp b/src/parsers/command-distribute.lisp index 6ae0b66..70b7c37 100644 --- 
a/src/parsers/command-distribute.lisp +++ b/src/parsers/command-distribute.lisp @@ -25,13 +25,13 @@ (defrule distribute-reference (and kw-distribute dsn-table-name kw-as kw-reference kw-table) (:lambda (d-r) - (make-citus-reference-table :table (create-table-from-dsn-table-name d-r)))) + (make-citus-reference-rule :table (create-table-from-dsn-table-name d-r)))) (defrule distribute-using (and kw-distribute dsn-table-name kw-using maybe-quoted-namestring) (:lambda (d-u) - (make-citus-distributed-table :table (create-table-from-dsn-table-name d-u) - :using (make-column :name (fourth d-u))))) + (make-citus-distributed-rule :table (create-table-from-dsn-table-name d-u) + :using (make-column :name (fourth d-u))))) ;;; ;;; The namestring rule allows for commas and we use them as a separator @@ -62,9 +62,9 @@ kw-using maybe-quoted-namestring kw-from distribute-from-list) (:lambda (d-u-f) - (make-citus-distributed-table :table (create-table-from-dsn-table-name d-u-f) - :using (make-column :name (fourth d-u-f)) - :from (mapcar #'create-table (sixth d-u-f))))) + (make-citus-distributed-rule :table (create-table-from-dsn-table-name d-u-f) + :using (make-column :name (fourth d-u-f)) + :from (mapcar #'create-table (sixth d-u-f))))) (defrule distribute-commands (+ (or distribute-using-from distribute-using diff --git a/src/pgsql/pgsql-ddl-citus.lisp b/src/pgsql/pgsql-ddl-citus.lisp index f74ade5..d5cd050 100644 --- a/src/pgsql/pgsql-ddl-citus.lisp +++ b/src/pgsql/pgsql-ddl-citus.lisp @@ -4,15 +4,15 @@ (in-package :pgloader.pgsql) -(defmethod format-create-sql ((rule citus-reference-table) +(defmethod format-create-sql ((rule citus-reference-rule) &key (stream nil) if-not-exists) (declare (ignore if-not-exists)) (format stream "SELECT create_reference_table('~a');" - (format-table-name (citus-reference-table-table rule)))) + (format-table-name (citus-reference-rule-table rule)))) -(defmethod format-create-sql ((rule citus-distributed-table) +(defmethod format-create-sql ((rule 
citus-distributed-rule) &key (stream nil) if-not-exists) (declare (ignore if-not-exists)) (format stream "SELECT create_distributed_table('~a', '~a');" - (format-table-name (citus-distributed-table-table rule)) - (column-name (citus-distributed-table-using rule)))) + (format-table-name (citus-distributed-rule-table rule)) + (column-name (citus-distributed-rule-using rule)))) diff --git a/src/utils/catalog.lisp b/src/utils/catalog.lisp index 6b29aad..baca81a 100644 --- a/src/utils/catalog.lisp +++ b/src/utils/catalog.lisp @@ -42,7 +42,7 @@ ;;; Column structures details depend on the specific source type and are ;;; implemented in each source separately. ;;; -(defstruct catalog name schema-list types-without-btree) +(defstruct catalog name schema-list types-without-btree distribution-rules) (defstruct schema source-name name catalog in-search-path table-list view-list extension-list sqltype-list) diff --git a/src/utils/citus.lisp b/src/utils/citus.lisp index 0cdcc9b..c1b73b8 100644 --- a/src/utils/citus.lisp +++ b/src/utils/citus.lisp @@ -17,58 +17,182 @@ |# -(in-package #:pgloader.catalog) +(in-package #:pgloader.citus) -(defstruct citus-reference-table table) -(defstruct citus-distributed-table table using from) +;;; +;;; Main data structures to host our distribution rules. +;;; +(defstruct citus-reference-rule table) +(defstruct citus-distributed-rule table using from) (defun citus-distribute-schema (catalog distribution-rules) - "Distribute a CATALOG with given user provided DISTRIBUTION-RULES." - (loop :for rule :in distribution-rules - :do (let ((table (citus-find-table catalog (citus-rule-table rule)))) - (apply-citus-rule rule table)))) + "Distribute a CATALOG with given user provided DISTRIBUTION-RULES. Return + the list of rules applied." 
+ (let ((processed-rules '()) + (derived-rules + (loop :for rule :in distribution-rules + :append (progn + (citus-set-table rule catalog) + (compute-foreign-rules rule (citus-rule-table rule)))))) -(defun citus-rule-table (rule) - (etypecase rule - (citus-reference-table (citus-reference-table-table rule)) - (citus-distributed-table (citus-distributed-table-table rule)))) + ;; + ;; Apply rules only once. + ;; + ;; ERROR Database error 42P16: table ;; "campaigns" is already distributed + ;; + (loop :for rule :in (append distribution-rules derived-rules) + :unless (member (table-oid (citus-rule-table rule)) + processed-rules + :key (lambda (rule) + (table-oid (citus-rule-table rule)))) + :collect (progn + (push rule processed-rules) + (apply-citus-rule rule) + rule)))) (defun citus-find-table (catalog table) (let* ((table-name (table-name table)) (schema-name (schema-name (table-schema table)))) (find-table (find-schema catalog schema-name) table-name))) -(defgeneric apply-citus-rule (rule table) +(defgeneric citus-rule-table (rule) + (:documentation "Returns the RULE's table.") + (:method ((rule citus-reference-rule)) (citus-reference-rule-table rule)) + (:method ((rule citus-distributed-rule)) (citus-distributed-rule-table rule))) + +(defgeneric citus-set-table (rule catalog) + (:documentation "Find citus RULE table in CATALOG and update the + placeholder with the table found there.") + (:method ((rule citus-reference-rule) (catalog catalog)) + (let ((table (citus-reference-rule-table rule))) + (setf (citus-reference-rule-table rule) + (citus-find-table catalog table)))) + + (:method ((rule citus-distributed-rule) (catalog catalog)) + (let ((table (citus-distributed-rule-table rule))) + (map-into (citus-distributed-rule-from rule) + (lambda (from) (citus-find-table catalog from)) + (citus-distributed-rule-from rule)) + (setf (citus-distributed-rule-table rule) + (citus-find-table catalog table))))) + +(defmethod print-object ((table citus-reference-rule) stream) + 
(print-unreadable-object (table stream :type t :identity t) + (with-slots (table) table + (format stream "distribute ~a as reference" (format-table-name table))))) + +(defmethod print-object ((table citus-distributed-rule) stream) + (print-unreadable-object (table stream :type t :identity t) + (with-slots (table using from) table + (format stream + "distribute ~a :using ~a~@[ :from ~{~a~^, ~}~]" + (format-table-name table) + (column-name using) + (mapcar #'format-table-name from))))) + + +;;; +;;; When distributing a table on a given key, we can follow foreign keys +;;; pointing to this table. We might find out that when computing the +;;; following rule: +;;; +;;; distribute companies using id +;;; +;;; We then want to add the set of rules that we find walking the foreign +;;; keys: +;;; +;;; distribute campaigns using company_id +;;; distribute ads using company_id from campaigns +;;; distribute clicks using company_id from ads, campaigns +;;; distribute impressions using company_id from ads, campaigns +;;; +(defgeneric compute-foreign-rules (rule table &key) + (:documentation + "Compute rules to apply that derive from the distribution rule RULE when + following foreign-keys from TABLE.")) + +(defmethod compute-foreign-rules ((rule citus-reference-rule) + (table table) + &key) + "There's nothing to do here, reference table doesn't impact the schema." + nil) + +(defmethod compute-foreign-rules ((rule citus-distributed-rule) + (table table) + &key fkey-list) + "Find every foreign key that points to TABLE and add return a list of new + rules for the source of those foreign keys." 
+ (let ((pkey (find-if #'index-primary (table-index-list table)))) + + (when (and pkey (member (column-name (citus-distributed-rule-using rule)) + (index-columns pkey) + :test #'string=)) + (loop :for fkey :in (index-fk-deps pkey) + :for new-fkey-list := (cons fkey fkey-list) + :for new-rule := (make-distributed-table-from-fkey rule new-fkey-list) + :collect new-rule :into new-rule-list + :collect (compute-foreign-rules rule (fkey-table fkey) + :fkey-list new-fkey-list) + :into dep-rule-list + :finally (return (append new-rule-list + ;; flatten sub-lists as we go + (apply #'append dep-rule-list))))))) + +(defun make-distributed-table-from-fkey (rule fkey-list) + "Make a new Citus distributed table rule from an existing rule and a fkey + definition." + ;; + ;; We have a list of foreign keys pointing from a current table, + ;; (fkey-table fkey), to the root table that is distributed, + ;; (fkey-foreign-table fkey). + ;; + ;; For the distribution key name, we consider the name of the column used + ;; in the last entry from the fkey-list, the column name that points to + ;; the root.id distribution key and might be named root_id or something. + ;; + ;; Then we only need to specifying USING the intermediate tables, the last + ;; entry gives us the data we need to backfill our tables. + ;; + (let* ((fkey (car (last fkey-list))) + (dist-key (column-name (citus-distributed-rule-using rule))) + (dist-key-pos (position dist-key + (fkey-foreign-columns fkey) + :test #'string=)) + (fkey-table-dist-key (nth dist-key-pos (fkey-columns fkey))) + (from-table-list (butlast (mapcar #'fkey-foreign-table fkey-list)))) + (make-citus-distributed-rule :table (fkey-table (first fkey-list)) + :using (make-column :name fkey-table-dist-key) + :from from-table-list))) + + +;;; +;;; Apply a citus distribution rule to given table, and store the rule +;;; itself to the table-citus-rule slot so that we later know to generate a +;;; proper SELECT query that includes the backfilling. 
+;;; +(defgeneric apply-citus-rule (rule) (:documentation "Apply a Citus distribution RULE to given TABLE.")) -(defmethod apply-citus-rule ((rule citus-reference-table) (table table)) +(defmethod apply-citus-rule ((rule citus-reference-rule)) ;; for a reference table, we have nothing to do really. - (setf (table-citus-rule table) rule)) - -(defmethod apply-citus-rule ((rule citus-distributed-table) (table table)) - (setf (table-citus-rule table) rule) - - ;; - ;; Replace the TABLE placeholders in the :FROM slot of the rule with the - ;; tables from the catalogs. - ;; - (when (citus-distributed-table-from rule) - (let ((catalog (schema-catalog (table-schema table)))) - (map-into (citus-distributed-table-from rule) - (lambda (from) (citus-find-table catalog from)) - (citus-distributed-table-from rule)))) + (setf (table-citus-rule (citus-reference-rule-table rule)) rule) + t) +(defmethod apply-citus-rule ((rule citus-distributed-rule)) ;; ok now we need to check if the USING column exists or if we need to add ;; it to our model - (let ((column (find (column-name (citus-distributed-table-using rule)) - (table-field-list table) - :test #'string= - :key #'column-name))) + (setf (table-citus-rule (citus-distributed-rule-table rule)) rule) + + (let* ((table (citus-distributed-rule-table rule)) + (column (find (column-name (citus-distributed-rule-using rule)) + (table-field-list table) + :test #'string= + :key #'column-name))) (if column ;; add it to the PKEY definition, in first position (add-column-to-pkey table - (column-name (citus-distributed-table-using rule))) + (column-name (citus-distributed-rule-using rule))) ;; The column doesn't exist, we need to find it in the :FROM rule's ;; list. The :FROM slot of the rule is a list of tables to @@ -78,9 +202,9 @@ ;; ;; To find the column definition to add to the current TABLE, look ;; it up in the last entry of the FROM rule's list. 
- (let* ((last-from-rule (car (last (citus-distributed-table-from rule)))) + (let* ((last-from-rule (car (last (citus-distributed-rule-from rule)))) (column-definition - (find (column-name (citus-distributed-table-using rule)) + (find (column-name (citus-distributed-rule-using rule)) (table-field-list last-from-rule) :test #'string= :key #'column-name)) @@ -122,13 +246,18 @@ :do (push column-name (fkey-foreign-columns fkey)) :do (setf (fkey-condef fkey) nil))))) - + +;;; +;;; Format a query for backfilling the data right from pgloader: +;;; +;;; SELECT dist_key, * FROM source JOIN pivot ON ... +;;; (defun format-citus-join-clause (table distribution-rule) "Format a JOIN clause to backfill the distribution key data in tables that are referencing (even indirectly) the main distribution table." (with-output-to-string (s) (loop :for current-table := table :then rel - :for rel :in (citus-distributed-table-from distribution-rule) + :for rel :in (citus-distributed-rule-from distribution-rule) :do (let* ((fkey (find (ensure-unquoted (table-name rel)) (table-fkey-list current-table) @@ -171,7 +300,7 @@ ;; in the rule). ;; (let* ((last-from-rule - (car (last (citus-distributed-table-from + (car (last (citus-distributed-rule-from (table-citus-rule target-table))))) (cols (append (list @@ -193,9 +322,12 @@ (table-source-name source-table) joins))) +;;; +;;; Predicate to see if a table needs backfilling +;;; (defun citus-backfill-table-p (table) "Returns non-nil when given TABLE should be backfilled with the distribution key." 
(and (table-citus-rule table) - (typep (table-citus-rule table) 'citus-distributed-table) - (not (null (citus-distributed-table-from (table-citus-rule table)))))) + (typep (table-citus-rule table) 'citus-distributed-rule) + (not (null (citus-distributed-rule-from (table-citus-rule table)))))) From 7b487ddacaf38a96acac29c29e815f0d608b59b5 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 18 Oct 2018 15:42:17 +0200 Subject: [PATCH 19/69] Add a Citus distribution test case, from the citus tutorial. --- test/citus/.gitignore | 1 + test/citus/Makefile | 20 ++++++++++++ test/citus/README.md | 42 +++++++++++++++++++++++++ test/citus/company.load | 12 ++++++++ test/citus/company.sql | 51 +++++++++++++++++++++++++++++++ test/citus/copy.sql | 5 +++ test/citus/data.load | 68 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 199 insertions(+) create mode 100644 test/citus/.gitignore create mode 100644 test/citus/Makefile create mode 100644 test/citus/README.md create mode 100644 test/citus/company.load create mode 100644 test/citus/company.sql create mode 100644 test/citus/copy.sql create mode 100644 test/citus/data.load diff --git a/test/citus/.gitignore b/test/citus/.gitignore new file mode 100644 index 0000000..16f2dc5 --- /dev/null +++ b/test/citus/.gitignore @@ -0,0 +1 @@ +*.csv \ No newline at end of file diff --git a/test/citus/Makefile b/test/citus/Makefile new file mode 100644 index 0000000..0c4c5b9 --- /dev/null +++ b/test/citus/Makefile @@ -0,0 +1,20 @@ +DATASET = companies campaigns ads clicks impressions geo_ips +CSV = $(addsuffix .csv,$(DATASET)) +DROP = DROP TABLE IF EXISTS companies, campaigns, ads, clicks, impressions, geo_ips + +all: schema data ; + +schema: + psql --single-transaction -c "$(DROP)" -d hackathon + psql --single-transaction -f company.sql -d hackathon + +data: fetch + psql -f copy.sql -d hackathon + ../../build/bin/pgloader ./data.load + +fetch: $(CSV) ; + +%.csv: + curl -O https://examples.citusdata.com/mt_ref_arch/$@ + 
+.PHONY: schema data fetch diff --git a/test/citus/README.md b/test/citus/README.md new file mode 100644 index 0000000..499ecd6 --- /dev/null +++ b/test/citus/README.md @@ -0,0 +1,42 @@ +# Citus Multi-Tenant Automatic Distribution + +In this test case we follow the following documentation: + + https://docs.citusdata.com/en/v7.5/use_cases/multi_tenant.html + +We install the schema before Citus migration, and load the data without the +backfilling that is already done. For that we use pgloader to ignore the +company_id column in the tables that didn't have this column prior to the +Citus migration effort. + +Then the following `company.load` file contains the pgloader command that +runs a full migration from PostgreSQL to Citus: + +``` +load database + from pgsql:///hackathon + into pgsql://localhost:9700/dim + + with include drop, reset no sequences + + distribute companies using id; +``` + +Tables are marked distributed, the company_id column is added where it's +needed, primary keys and foreign keys definitions are altered to the new +model, and finally the data is backfilled automatically in the target table +thanks to generating queries like the following: + +~~~ +SELECT "campaigns".company_id::text, + "impressions".id::text, + "impressions".ad_id::text, + "impressions".seen_at::text, + "impressions".site_url::text, + "impressions".cost_per_impression_usd::text, + "impressions".user_ip::text, + "impressions".user_data::text + FROM "public"."impressions" + JOIN "public"."ads" ON impressions.ad_id = ads.id + JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id +~~~ diff --git a/test/citus/company.load b/test/citus/company.load new file mode 100644 index 0000000..ef4af21 --- /dev/null +++ b/test/citus/company.load @@ -0,0 +1,12 @@ +load database + from pgsql:///hackathon + into pgsql://localhost:9700/dim + + with include drop, reset no sequences + + distribute companies using id + -- distribute campaigns using company_id + -- distribute ads using company_id 
from campaigns + -- distribute clicks using company_id from ads, campaigns + -- distribute impressions using company_id from ads, campaigns + ; diff --git a/test/citus/company.sql b/test/citus/company.sql new file mode 100644 index 0000000..dad23dc --- /dev/null +++ b/test/citus/company.sql @@ -0,0 +1,51 @@ +CREATE TABLE companies ( + id bigserial PRIMARY KEY, + name text NOT NULL, + image_url text, + created_at timestamp without time zone NOT NULL, + updated_at timestamp without time zone NOT NULL +); + +CREATE TABLE campaigns ( + id bigserial PRIMARY KEY, + company_id bigint REFERENCES companies (id), + name text NOT NULL, + cost_model text NOT NULL, + state text NOT NULL, + monthly_budget bigint, + blacklisted_site_urls text[], + created_at timestamp without time zone NOT NULL, + updated_at timestamp without time zone NOT NULL +); + +CREATE TABLE ads ( + id bigserial PRIMARY KEY, + campaign_id bigint REFERENCES campaigns (id), + name text NOT NULL, + image_url text, + target_url text, + impressions_count bigint DEFAULT 0, + clicks_count bigint DEFAULT 0, + created_at timestamp without time zone NOT NULL, + updated_at timestamp without time zone NOT NULL +); + +CREATE TABLE clicks ( + id bigserial PRIMARY KEY, + ad_id bigint REFERENCES ads (id), + clicked_at timestamp without time zone NOT NULL, + site_url text NOT NULL, + cost_per_click_usd numeric(20,10), + user_ip inet NOT NULL, + user_data jsonb NOT NULL +); + +CREATE TABLE impressions ( + id bigserial PRIMARY KEY, + ad_id bigint REFERENCES ads (id), + seen_at timestamp without time zone NOT NULL, + site_url text NOT NULL, + cost_per_impression_usd numeric(20,10), + user_ip inet NOT NULL, + user_data jsonb NOT NULL +); diff --git a/test/citus/copy.sql b/test/citus/copy.sql new file mode 100644 index 0000000..684f891 --- /dev/null +++ b/test/citus/copy.sql @@ -0,0 +1,5 @@ +\copy companies from 'companies.csv' with csv +\copy campaigns from 'campaigns.csv' with csv +-- \copy ads from 'ads.csv' with csv +-- 
\copy clicks from 'clicks.csv' with csv +-- \copy impressions from 'impressions.csv' with csv diff --git a/test/citus/data.load b/test/citus/data.load new file mode 100644 index 0000000..cbb29b0 --- /dev/null +++ b/test/citus/data.load @@ -0,0 +1,68 @@ +-- +-- Ads +-- +load csv + from ads.csv + ( + id, company_id, campaign_id, name, image_url, target_url, + impressions_count, clicks_count, created_at, updated_at + ) + + into postgresql:///hackathon + + target table ads + target columns + ( + id, campaign_id, name, image_url, target_url, + impressions_count, clicks_count, created_at, updated_at + ) + + with fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by ','; + +-- +-- Clicks +-- +load csv + from clicks.csv + ( + id, company_id, ad_id, clicked_at, site_url, cost_per_click_usd, + user_ip, user_data + ) + + into postgresql:///hackathon + + target table clicks + target columns + ( + id, ad_id, clicked_at, site_url, cost_per_click_usd, user_ip, user_data + ) + + with fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by ','; + + +-- +-- Impressions +-- +load csv + from impressions.csv + ( + id, company_id, ad_id, seen_at, site_url, + cost_per_impression_usd, user_ip, user_data + ) + + into postgresql:///hackathon + + target table impressions + target columns + ( + id, ad_id, seen_at, site_url, cost_per_impression_usd, user_ip, user_data + ) + + with drop indexes, + fields optionally enclosed by '"', + fields escaped by double-quote, + fields terminated by ','; \ No newline at end of file From 0e6f599282e9f799ec47e8c32dde95d3b0c13201 Mon Sep 17 00:00:00 2001 From: Larry Gebhardt Date: Thu, 18 Oct 2018 12:55:56 -0400 Subject: [PATCH 20/69] Add Docker build instructions (#853) --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index d99cc2d..a3b3049 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,16 @@ pgloader: +### Building Docker image 
from sources + +You can build a Docker image from source using SBCL by default: + + $ docker build . + +Or Clozure CL (CCL): + + $ docker build -f Dockerfile.ccl . + ## More options when building from source The `Makefile` target `pgloader` knows how to produce a Self Contained From 6e7ea9080693c68368fc13075a5360e2bd37bec7 Mon Sep 17 00:00:00 2001 From: Jason Rigby Date: Fri, 19 Oct 2018 03:56:40 +1100 Subject: [PATCH 21/69] add cl-ironclad and cl-babel dependencies to docker builds (#854) --- Dockerfile | 4 +++- Dockerfile.ccl | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0500aa2..ea6a08c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,8 @@ FROM debian:stable-slim as builder time \ unzip \ wget \ + cl-ironclad \ + cl-babel \ && rm -rf /var/lib/apt/lists/* COPY ./ /opt/src/pgloader @@ -42,4 +44,4 @@ FROM debian:stable-slim COPY --from=builder /opt/src/pgloader/build/bin/pgloader /usr/local/bin - LABEL maintainer="Dimitri Fontaine " \ No newline at end of file + LABEL maintainer="Dimitri Fontaine " diff --git a/Dockerfile.ccl b/Dockerfile.ccl index f88468a..9377fe0 100644 --- a/Dockerfile.ccl +++ b/Dockerfile.ccl @@ -18,6 +18,8 @@ FROM debian:stable-slim as builder time \ unzip \ wget \ + cl-ironclad \ + cl-babel \ && rm -rf /var/lib/apt/lists/* RUN curl -SL https://github.com/Clozure/ccl/releases/download/v1.11.5/ccl-1.11.5-linuxx86.tar.gz \ @@ -46,4 +48,4 @@ FROM debian:stable-slim COPY --from=builder /opt/src/pgloader/build/bin/pgloader /usr/local/bin - LABEL maintainer="Dimitri Fontaine " \ No newline at end of file + LABEL maintainer="Dimitri Fontaine " From f8460c17056df902a10bb08596beed610749a2af Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sat, 20 Oct 2018 19:28:19 +0200 Subject: [PATCH 22/69] Allow usernames and dbnames starting with digits (again). 
It turns out that the rules about the names of users and databases are more lax than pgloader would know, so it might be a good move for our DSN parsing to accept more values and then let the source/target systems complain when something goes wrong. See #230 which got broken again somewhere. --- src/parsers/command-db-uri.lisp | 11 ++++++----- src/parsers/command-mysql.lisp | 5 +---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/parsers/command-db-uri.lisp b/src/parsers/command-db-uri.lisp index 7420e01..339d45f 100644 --- a/src/parsers/command-db-uri.lisp +++ b/src/parsers/command-db-uri.lisp @@ -25,7 +25,7 @@ (defrule doubled-at-sign (and "@@") (:constant "@")) (defrule doubled-colon (and "::") (:constant ":")) (defrule password (+ (or (not "@") doubled-at-sign)) (:text t)) -(defrule username (and (or #\_ (alpha-char-p character)) +(defrule username (and (or #\_ (alpha-char-p character) (digit-char-p character)) (* (or (alpha-char-p character) (digit-char-p character) #\. @@ -87,10 +87,11 @@ (append (list :host (when host (process-hostname host))) port)))) -(defrule dsn-dbname (and "/" (? maybe-quoted-namestring)) - (:destructure (slash dbname) - (declare (ignore slash)) - (list :dbname dbname))) +(defrule dsn-dbname (and "/" (?
(* (or (alpha-char-p character) + (digit-char-p character) + punct)))) + (:lambda (dbn) + (list :dbname (text (second dbn))))) (defrule dsn-option-ssl-disable "disable" (:constant :no)) (defrule dsn-option-ssl-allow "allow" (:constant :try)) diff --git a/src/parsers/command-mysql.lisp b/src/parsers/command-mysql.lisp index 703515f..bbae776 100644 --- a/src/parsers/command-mysql.lisp +++ b/src/parsers/command-mysql.lisp @@ -95,9 +95,6 @@ (defrule mysql-prefix "mysql://" (:constant (list :type :mysql))) -(defrule mysql-dsn-dbname (and "/" maybe-quoted-namestring) - (:lambda (m-d-d) (list :dbname (text (second m-d-d))))) - (defrule mysql-dsn-option-usessl-true "true" (:constant :yes)) (defrule mysql-dsn-option-usessl-false "false" (:constant :no)) @@ -123,7 +120,7 @@ (defrule mysql-uri (and mysql-prefix (? dsn-user-password) (? dsn-hostname) - mysql-dsn-dbname + dsn-dbname (? mysql-dsn-options)) (:lambda (uri) (destructuring-bind (&key type From 207cd82726ef5a88554bf2bf59303a876027d704 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 7 Nov 2018 11:01:06 +0100 Subject: [PATCH 23/69] Improve SQLite type names parsing. Allow spaces in more random places, as SQLite doesn't seem to normalize the user input. Fixes #548 again. 
--- src/parsers/command-csv.lisp | 5 ----- src/parsers/command-utils.lisp | 8 ++++++++ src/parsers/parse-sqlite-type-name.lisp | 14 ++++++++------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/parsers/command-csv.lisp b/src/parsers/command-csv.lisp index 1c2afa4..c7e4e11 100644 --- a/src/parsers/command-csv.lisp +++ b/src/parsers/command-csv.lisp @@ -231,11 +231,6 @@ (destructuring-bind (field1 fields) source (list* field1 fields)))) -(defrule open-paren (and ignore-whitespace #\( ignore-whitespace) - (:constant :open-paren)) -(defrule close-paren (and ignore-whitespace #\) ignore-whitespace) - (:constant :close-paren)) - (defrule having-fields (and kw-having kw-fields) (:constant nil)) (defrule csv-source-field-list (and (? having-fields) diff --git a/src/parsers/command-utils.lisp b/src/parsers/command-utils.lisp index 4383ab3..ebc476d 100644 --- a/src/parsers/command-utils.lisp +++ b/src/parsers/command-utils.lisp @@ -57,3 +57,11 @@ quoted-namestring namestring)) +(defrule open-paren (and ignore-whitespace #\( ignore-whitespace) + (:constant :open-paren)) + +(defrule close-paren (and ignore-whitespace #\) ignore-whitespace) + (:constant :close-paren)) + +(defrule comma-separator (and ignore-whitespace #\, ignore-whitespace) + (:constant ",")) diff --git a/src/parsers/parse-sqlite-type-name.lisp b/src/parsers/parse-sqlite-type-name.lisp index 81d9a0a..09eb0e8 100644 --- a/src/parsers/parse-sqlite-type-name.lisp +++ b/src/parsers/parse-sqlite-type-name.lisp @@ -15,14 +15,16 @@ (? 
" ")) (:lambda (noise) (second noise))) -(defrule sqlite-single-typemod (and #\( (+ (digit-char-p character)) #\)) +(defrule sqlite-single-typemod (and open-paren + (+ (digit-char-p character)) + close-paren) (:lambda (st) (cons (parse-integer (text (second st))) nil))) -(defrule sqlite-double-typemod (and #\( +(defrule sqlite-double-typemod (and open-paren (+ (digit-char-p character)) - (* (or #\, #\Space)) + comma-separator (+ (digit-char-p character)) - #\)) + close-paren) (:lambda (dt) (cons (parse-integer (text (second dt))) (parse-integer (text (fourth dt)))))) @@ -31,9 +33,9 @@ (defrule sqlite-type-name (and (* extra-qualifiers) (+ (alpha-char-p character)) (* extra-qualifiers) - (* #\Space) + ignore-whitespace (? sqlite-typemod) - (* #\Space) + ignore-whitespace (* extra-qualifiers)) (:lambda (tn) (list (text (second tn)) (fifth tn) From 794bc7fc6436ca05a4208fbb5357b8d1791a9b3a Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 7 Nov 2018 21:05:59 +0100 Subject: [PATCH 24/69] Improve redshift support: string_agg() doesn't exist there. Neither does array_agg(), unnest() and other very useful PostgreSQL functions. Redshift is from 8.0 times, so do things the old way: parse the output of the index definition that get from calling pg_index_def(). For that, this patch introduces the notion of SQL support that depends on PostgreSQL major version. If no major-version specific query is found in the pgloader source tree, then we use the generic one. Fixes #860. 
--- src/load/load-file.lisp | 3 +- src/load/migrate-database.lisp | 8 +++-- src/package.lisp | 3 +- src/pgsql/pgsql-schema.lisp | 42 +++++++++++++++++++++++--- src/pgsql/sql/8.0/README.md | 4 +++ src/pgsql/sql/8.0/list-all-indexes.sql | 29 ++++++++++++++++++ src/sources/pgsql/pgsql.lisp | 6 ++-- src/utils/queries.lisp | 19 ++++++++++++ 8 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 src/pgsql/sql/8.0/README.md create mode 100644 src/pgsql/sql/8.0/list-all-indexes.sql diff --git a/src/load/load-file.lisp b/src/load/load-file.lisp index 7d36d12..02ddb45 100644 --- a/src/load/load-file.lisp +++ b/src/load/load-file.lisp @@ -44,7 +44,8 @@ (setf pgsql-catalog (fetch-pgsql-catalog (db-name pgconn) :table (target copy) - :variant (pgconn-variant pgconn))) + :variant (pgconn-variant pgconn) + :pgversion (pgconn-major-version pgconn))) ;; if the user didn't tell us the column list of the table, now is ;; a proper time to set it in the copy object diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index 0a5452e..3037571 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -70,9 +70,11 @@ ;; ;; to be able to do that properly, get the constraints from ;; the pre-existing target database catalog - (let ((pgsql-catalog - (fetch-pgsql-catalog (db-name (target-db copy)) - :source-catalog catalog))) + (let* ((pgversion (pgconn-major-version (target-db copy))) + (pgsql-catalog + (fetch-pgsql-catalog (db-name (target-db copy)) + :source-catalog catalog + :pgversion pgversion))) (merge-catalogs catalog pgsql-catalog)) ;; now the foreign keys and only then the indexes, because a diff --git a/src/package.lisp b/src/package.lisp index 0d3e5b6..20820ab 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -287,7 +287,8 @@ (defpackage #:pgloader.queries (:use #:cl #:pgloader.params) (:export #:*queries* - #:sql)) + #:sql + #:sql-url-for-variant)) (defpackage #:pgloader.citus (:use #:cl diff --git 
a/src/pgsql/pgsql-schema.lisp b/src/pgsql/pgsql-schema.lisp index 59f33f0..0b98459 100644 --- a/src/pgsql/pgsql-schema.lisp +++ b/src/pgsql/pgsql-schema.lisp @@ -10,7 +10,8 @@ source-catalog including excluding - (variant :pgdg)) + (variant :pgdg) + pgversion) "Fetch PostgreSQL catalogs for the target database. A PostgreSQL connection must be opened." (let* ((*identifier-case* :quote) @@ -35,7 +36,8 @@ (list-all-indexes catalog :including including - :excluding excluding) + :excluding excluding + :pgversion pgversion) (when (eq :pgdg variant) (list-all-fkeys catalog @@ -193,7 +195,7 @@ (add-field table field)) :finally (return catalog))) -(defun list-all-indexes (catalog &key including excluding) +(defun list-all-indexes (catalog &key including excluding pgversion) "Get the list of PostgreSQL index definitions per table." (loop :for (schema-name name oid @@ -201,7 +203,9 @@ primary unique cols sql conname condef) :in (query nil (format nil - (sql "/pgsql/list-all-indexes.sql") + (sql (sql-url-for-variant "pgsql" + "list-all-indexes.sql" + pgversion)) including ; do we print the clause? (filter-list-to-where-clause including nil @@ -215,6 +219,7 @@ :do (let* ((schema (find-schema catalog schema-name)) (tschema (find-schema catalog table-schema)) (table (find-table tschema table-name)) + (columns (parse-index-column-names cols sql)) (pg-index (make-index :name (ensure-quoted name) :oid oid @@ -222,7 +227,7 @@ :table table :primary primary :unique unique - :columns (split-sequence:split-sequence #\, cols) + :columns columns :sql sql :conname (unless (eq :null conname) (ensure-quoted conname)) @@ -438,3 +443,30 @@ ;; going to take care of creating the type. (add-sqltype schema sqltype))) :finally (return catalog))) + + + +;;; +;;; Extra utils like parsing a list of column names from an index definition. +;;; +(defun parse-index-column-names (columns index-definition) + "Return a list of column names for the given index." 
+ (if (and columns (not (eq :null columns))) + ;; the normal case, not much parsing to do, the data has been prepared + ;; for us in the SQL query + (split-sequence:split-sequence #\, columns) + + ;; the redshift variant case, where there's no way to string_agg or + ;; even array_to_string(array_agg(...)) and so we need to parse the + ;; index-definition instead. + ;; + ;; CREATE UNIQUE INDEX pg_amproc_opc_proc_index ON pg_amproc USING btree (amopclaid, amprocsubtype, amprocnum) + (when index-definition + (let ((open-paren-pos (position #\( index-definition)) + (close-paren-pos (position #\) index-definition))) + (when (and open-paren-pos close-paren-pos) + (mapcar (lambda (colname) (string-trim " " colname)) + (split-sequence:split-sequence #\, + index-definition + :start (+ 1 open-paren-pos) + :end close-paren-pos))))))) diff --git a/src/pgsql/sql/8.0/README.md b/src/pgsql/sql/8.0/README.md new file mode 100644 index 0000000..dc4eddb --- /dev/null +++ b/src/pgsql/sql/8.0/README.md @@ -0,0 +1,4 @@ +Redshift is a fork of PostgreSQL 8.0, and our catalog queries must then +target this old PostgreSQL version to work on Redshift. Parts of what we +would usually implement in SQL are implemented in pgloader code instead, in +order to support such an old PostgreSQL version.
diff --git a/src/pgsql/sql/8.0/list-all-indexes.sql b/src/pgsql/sql/8.0/list-all-indexes.sql new file mode 100644 index 0000000..2efc8ce --- /dev/null +++ b/src/pgsql/sql/8.0/list-all-indexes.sql @@ -0,0 +1,29 @@ +-- params: including +-- filter-list-to-where-clause for including +-- excluding +-- filter-list-to-where-clause for excluding + select n.nspname, + i.relname, + i.oid, + rn.nspname, + r.relname, + indisprimary, + indisunique, + null, + pg_get_indexdef(indexrelid), + c.conname, + pg_get_constraintdef(c.oid) + from pg_index x + join pg_class i ON i.oid = x.indexrelid + join pg_class r ON r.oid = x.indrelid + join pg_namespace n ON n.oid = i.relnamespace + join pg_namespace rn ON rn.oid = r.relnamespace + left join pg_depend d on d.classid = 'pg_class'::regclass + and d.objid = i.oid + and d.refclassid = 'pg_constraint'::regclass + and d.deptype = 'i' + left join pg_constraint c ON c.oid = d.refobjid + where n.nspname !~~ '^pg_' and n.nspname <> 'information_schema' + ~:[~*~;and (~{~a~^~&~10t or ~})~] + ~:[~*~;and (~{~a~^~&~10t and ~})~] +order by n.nspname, r.relname; diff --git a/src/sources/pgsql/pgsql.lisp b/src/sources/pgsql/pgsql.lisp index da6d611..d62038c 100644 --- a/src/sources/pgsql/pgsql.lisp +++ b/src/sources/pgsql/pgsql.lisp @@ -82,7 +82,8 @@ :use-result-as-read t :section :pre) (with-pgsql-transaction (:pgconn (source-db pgsql)) - (let ((variant (pgconn-variant (source-db pgsql)))) + (let ((variant (pgconn-variant (source-db pgsql))) + (pgversion (pgconn-major-version (source-db pgsql)))) (when (eq :pgdg variant) (list-all-sqltypes catalog :including including @@ -95,7 +96,8 @@ (when create-indexes (list-all-indexes catalog :including including - :excluding excluding)) + :excluding excluding + :pgversion pgversion)) (when (and (eq :pgdg variant) foreign-keys) (list-all-fkeys catalog diff --git a/src/utils/queries.lisp b/src/utils/queries.lisp index b0364d1..45a3c61 100644 --- a/src/utils/queries.lisp +++ b/src/utils/queries.lisp @@ -66,3 
+66,22 @@ (recompute-fs-and-retry () (setf *fs* (walk-sources-and-build-fs)) (sql url)))) + +(defun sql-url-for-variant (base filename &optional variant) + "Build a SQL URL for given VARIANT" + (flet ((sql-base-url (base filename) + (format nil "/~a/~a" base filename))) + (if variant + (let ((sql-variant-url + (format nil "/~a/~a/~a" + base + (string-downcase (typecase variant + (symbol (symbol-name variant)) + (string variant) + (t (princ-to-string variant)))) + filename))) + (if (gethash sql-variant-url *fs*) + sql-variant-url + (sql-base-url base filename))) + + (sql-base-url base filename)))) From 6c804042490e978b1cc630395ff294be2a043914 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 9 Nov 2018 22:41:14 +0100 Subject: [PATCH 25/69] Implement support for Redshift "identity" columns. At this stage we don't even parse the details of the Redshift identity such as the seed and step values and consider them the same as a MySQL auto_increment extra description field. Fixes #860 (again). --- src/pgsql/pgsql-finalize-catalogs.lisp | 6 +++++- src/sources/pgsql/pgsql-cast-rules.lisp | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/pgsql/pgsql-finalize-catalogs.lisp b/src/pgsql/pgsql-finalize-catalogs.lisp index 5a4eeef..5684100 100644 --- a/src/pgsql/pgsql-finalize-catalogs.lisp +++ b/src/pgsql/pgsql-finalize-catalogs.lisp @@ -15,6 +15,8 @@ (in-package #:pgloader.pgsql) (defun finalize-catalogs (catalog variant) + "Finalize the target PostgreSQL catalogs, dumbing down datatypes when the + target actually is Redshift rather than core PostgreSQL." ;; ;; For Core PostgreSQL, we also want to find data types names that have ;; no Btree support and fetch alternatives. 
This allows for supporting @@ -30,7 +32,9 @@ ;; (adjust-data-types catalog variant)) -(defgeneric adjust-data-types (catalog variant)) +(defgeneric adjust-data-types (catalog variant) + (:documentation + "Adjust PostgreSQL data types depending on the variant we target.")) ;;; ;;; Nothing needs to be done for PostgreSQL variant :pgdg, of course. diff --git a/src/sources/pgsql/pgsql-cast-rules.lisp b/src/sources/pgsql/pgsql-cast-rules.lisp index 6ac37ee..6c0690e 100644 --- a/src/sources/pgsql/pgsql-cast-rules.lisp +++ b/src/sources/pgsql/pgsql-cast-rules.lisp @@ -36,13 +36,16 @@ pgloader.catalog::extra) field (let* ((ctype (pgsql-column-ctype field)) + (extra (when (and (stringp (column-default field)) + (search "identity" (column-default field))) + :auto-increment)) (pgcol (apply-casting-rules nil pgloader.catalog::name pgloader.catalog::type-name ctype pgloader.catalog::default pgloader.catalog::nullable - pgloader.catalog::extra))) + extra))) ;; re-install our instruction not to transform default value: it comes ;; from PostgreSQL, and we trust it. (setf (column-transform-default pgcol) @@ -55,10 +58,16 @@ ((and (stringp default) (string= "NULL" default)) :null) - ((and (stringp default) - (or (string= "getdate()" default))) + ((and (stringp default) (string= "getdate()" default)) :current-timestamp) + ;; get rid of the identity default value, we already added + ;; an hint in the column-extra field. + ;; + ;; "identity"(347358, 0, ('1,1'::character varying)::text) + ((and (stringp default) (search "identity" default)) + :null) + (t (column-default pgcol)))) ;; we usually trust defaults that come from PostgreSQL... but we From 6eaad0621bc6661c314f21f5da158d27282dc02c Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 9 Nov 2018 22:42:31 +0100 Subject: [PATCH 26/69] Desultory code maintenance for MS SQL identity support. 
The code expects the keyword :auto-increment rather than a string nowadays in order to process an extra column bits of information as meaning that we want to cast to a serial/bigserial datatype. --- src/sources/mssql/mssql-cast-rules.lisp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sources/mssql/mssql-cast-rules.lisp b/src/sources/mssql/mssql-cast-rules.lisp index 471740b..dec7ee8 100644 --- a/src/sources/mssql/mssql-cast-rules.lisp +++ b/src/sources/mssql/mssql-cast-rules.lisp @@ -125,7 +125,7 @@ field (declare (ignore schema)) ; FIXME (let* ((ctype (mssql-column-ctype field)) - (extra (when (mssql-column-identity field) "auto_increment")) + (extra (when (mssql-column-identity field) :auto-increment)) (pgcol (apply-casting-rules table-name name type ctype default nullable extra))) ;; the MS SQL driver smartly maps data to the proper CL type, but the From 656bf850752ab06435870d2e68696ff12a285362 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sat, 10 Nov 2018 20:22:04 +0100 Subject: [PATCH 27/69] Review field to column projection code emitted. The code emitted by pgloader to transform input fields into PostgreSQL column values was using too many optimization declarations, some of them that SBCL failed to follow through for lack of type marking in the generated code. As SBCL doesn't have enough information to be optimizing anyway, at least we can make it so that we don't have a warning about it. The new code does that. Fixes #803. 
--- src/sources/common/project-fields.lisp | 4 ++-- test/allcols.load | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sources/common/project-fields.lisp b/src/sources/common/project-fields.lisp index 0ab57ff..f82d46c 100644 --- a/src/sources/common/project-fields.lisp +++ b/src/sources/common/project-fields.lisp @@ -20,7 +20,6 @@ nil col)) (lambda (col) - (declare (optimize speed)) (if (string= null-as col) nil col)))) (field-name-as-symbol (field-name-or-list) @@ -120,7 +119,8 @@ (destructuring-bind (&optional ,@args &rest extra) row (declare (ignorable ,@args) (ignore extra)) (let ,values - (declare (ignorable ,@args)) + (declare (ignorable ,@args) + (type vector ,@args)) (vector ,@newrow))))))))) ;; allow for some debugging (if compile (compile nil projection) projection)))) diff --git a/test/allcols.load b/test/allcols.load index e5c4e29..2598466 100644 --- a/test/allcols.load +++ b/test/allcols.load @@ -13,7 +13,7 @@ LOAD CSV FROM inline (a, b, c) - INTO postgresql:///pgloader?allcols (a, b, c) + INTO postgresql:///pgloader?allcols (a, b, c text using (subseq c 0)) WITH fields optionally enclosed by '"', fields escaped by double-quote, From a6ef7a56a99d84d7279ebbdc24585f0aece8a3d5 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sat, 10 Nov 2018 21:01:30 +0100 Subject: [PATCH 28/69] Implement ipv6 hostname support in .pgpass rules. A hostname could be written [::1] in .pgpass, without having to escape the colon characters, and with a proper enclosing in square brackets, as common for ipv6 addresses. Fixes #837.
--- src/parsers/parse-pgpass.lisp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/parsers/parse-pgpass.lisp b/src/parsers/parse-pgpass.lisp index 82efbee..5f62aba 100644 --- a/src/parsers/parse-pgpass.lisp +++ b/src/parsers/parse-pgpass.lisp @@ -14,8 +14,14 @@ (defrule pgpass-escaped-char (and #\\ (or #\\ #\:)) (:lambda (c) (second c))) +(defrule pgpass-ipv6-hostname (and #\[ + (+ (or (digit-char-p character) ":")) + #\]) + (:lambda (ipv6) (text (second ipv6)))) + (defrule pgpass-entry (or "*" - (+ (or pgpass-escaped-char + (+ (or pgpass-ipv6-hostname + pgpass-escaped-char (pgpass-char-p character)))) (:lambda (e) (text e))) From 5ecf04acb910aba17cd5e2001ccb6c8faef37468 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 13 Nov 2018 21:35:48 +0100 Subject: [PATCH 29/69] Implement null if support as a WITH option. This gives a default "null if" option to all the input columns at once, and it's still possible to override the default per column. In passing, fix project-fields declarations that SBCL now complains about when they're not true, such as declaring a vector when we might have :null or nil. As a result, remove the (declare (optimize speed)) in the generated field processing code. 
--- src/parsers/command-csv.lisp | 52 +++++++++++++++----------- src/sources/common/project-fields.lisp | 5 +-- test/csv-null-if.load | 22 +++++++++++ 3 files changed, 55 insertions(+), 24 deletions(-) create mode 100644 test/csv-null-if.load diff --git a/src/parsers/command-csv.lisp b/src/parsers/command-csv.lisp index c7e4e11..0df8dd3 100644 --- a/src/parsers/command-csv.lisp +++ b/src/parsers/command-csv.lisp @@ -134,7 +134,8 @@ option-fields-terminated-by option-trim-unquoted-blanks option-keep-unquoted-blanks - option-csv-escape-mode)) + option-csv-escape-mode + option-null-if)) (defrule csv-options (and kw-with (and csv-option (* (and comma csv-option)))) @@ -429,26 +430,35 @@ (progn ,(sql-code-block pg-db-conn :pre before "before load") - (let ((on-error-stop (getf ',options :on-error-stop)) - (truncate (getf ',options :truncate)) - (disable-triggers (getf ',options :disable-triggers)) - (drop-indexes (getf ',options :drop-indexes)) - (max-parallel-create-index (getf ',options :max-parallel-create-index)) - (source - (make-instance 'copy-csv - :target-db ,pg-db-conn - :source source-db - :target (create-table ',target-table-name) - :encoding ,encoding - :fields ',fields - :columns ',columns - ,@(remove-batch-control-option - options :extras '(:worker-count - :concurrency - :truncate - :drop-indexes - :disable-triggers - :max-parallel-create-index))))) + (let* ((on-error-stop (getf ',options :on-error-stop)) + (truncate (getf ',options :truncate)) + (disable-triggers (getf ',options :disable-triggers)) + (drop-indexes (getf ',options :drop-indexes)) + (max-parallel-create-index (getf ',options :max-parallel-create-index)) + (fields + ',(let ((null-as (getf options :null-as))) + (if null-as + (mapcar (lambda (field) + (if (member :null-as field) field + (append field (list :null-as null-as)))) + fields) + fields))) + (source + (make-instance 'copy-csv + :target-db ,pg-db-conn + :source source-db + :target (create-table ',target-table-name) + :encoding 
,encoding + :fields fields + :columns ',columns + ,@(remove-batch-control-option + options :extras '(:null-as + :worker-count + :concurrency + :truncate + :drop-indexes + :disable-triggers + :max-parallel-create-index))))) (copy-database source ,@ (when worker-count (list :worker-count worker-count)) diff --git a/src/sources/common/project-fields.lisp b/src/sources/common/project-fields.lisp index f82d46c..dc47197 100644 --- a/src/sources/common/project-fields.lisp +++ b/src/sources/common/project-fields.lisp @@ -115,12 +115,11 @@ sexp)) (t sexp))))) `(lambda (row) - (declare (optimize speed) (type list row)) + (declare (type list row)) (destructuring-bind (&optional ,@args &rest extra) row (declare (ignorable ,@args) (ignore extra)) (let ,values - (declare (ignorable ,@args) - (type vector ,@args)) + (declare (ignorable ,@args)) (vector ,@newrow))))))))) ;; allow for some debugging (if compile (compile nil projection) projection)))) diff --git a/test/csv-null-if.load b/test/csv-null-if.load new file mode 100644 index 0000000..c35d24b --- /dev/null +++ b/test/csv-null-if.load @@ -0,0 +1,22 @@ +LOAD CSV + FROM INLINE (id, number, data) + INTO postgresql:///pgloader?nullif + + BEFORE LOAD DO + $$ drop table if exists nullif; $$, + $$ CREATE TABLE nullif + ( + id serial primary key, + number integer, + data text + ); + $$ + + WITH null if '\N', + fields terminated by ',', + fields enclosed by '"', + fields escaped by backslash-quote; + + +"1",\N,"testing nulls" +"2","2","another test" \ No newline at end of file From 16dda01f371f033e0df75d80127643605df7830f Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 15 Nov 2018 00:13:21 +0100 Subject: [PATCH 30/69] Deal with SSL verify error the wrong way. This patch adds an option --no-ssl-cert-verification that allows bypassing OpenSSL server certificate verification. 
It's hopefully a temporary measure that we set up in order to make progress when confronted with: SSL verify error: 20 X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY The real solution is of course to install the SSL certificates at a place where pgloader will look for them, which defaults to ~/.postgresql/postgresql.crt at the moment. It's not clear what the story is with the defaults from /etc/ssl, or how to make things happen in a better way. See #648, See #679, See #768, See #748, See #775. --- src/main.lisp | 14 ++++++++++++- src/pgsql/connection.lisp | 43 +++++++++++++++++++++++++++++---------- src/utils/threads.lisp | 6 +++++- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/src/main.lisp b/src/main.lisp index a317232..c8e43f7 100644 --- a/src/main.lisp +++ b/src/main.lisp @@ -51,6 +51,10 @@ ("on-error-stop" :type boolean :documentation "Refrain from handling errors properly.") + ("no-ssl-cert-verification" + :type boolean + :documentation "Instruct OpenSSL to bypass verifying certificates.") + (("context" #\C) :type string :documentation "Command Context Variables") (("with") :type string :list t :optional t @@ -197,6 +201,7 @@ client-min-messages log-min-messages summary root-dir self-upgrade with set field cast type encoding before after + no-ssl-cert-verification regress) options @@ -249,11 +254,15 @@ (lisp-implementation-type) (lisp-implementation-version))) - (when help + (when (or help) (usage argv)) (when (or help version) (uiop:quit +os-code-success+)) + (when (null arguments) + (usage argv) + (uiop:quit +os-code-error-usage+)) + (when list-encodings (show-encodings) (uiop:quit +os-code-success+)) @@ -316,6 +325,9 @@ (uiop:native-namestring *log-filename*)) (log-message :log "Data errors in '~a'~%" *root-dir*) + (when no-ssl-cert-verification + (setf cl+ssl:*make-ssl-client-stream-verify-default* nil)) + (cond ((and regress (= 1 (length arguments))) (process-regression-test (first arguments))) diff --git a/src/pgsql/connection.lisp
b/src/pgsql/connection.lisp index 63af2af..9896559 100644 --- a/src/pgsql/connection.lisp +++ b/src/pgsql/connection.lisp @@ -118,7 +118,19 @@ (uiop:native-namestring crt-file))) (pomo::*ssl-key-file* (when (and (ssl-enable-p pgconn) (probe-file key-file)) - (uiop:native-namestring key-file)))) + (uiop:native-namestring key-file))) + ;; + ;; It's ok to set :verify-mode to NONE here because + ;; cl+ssl:*make-ssl-client-stream-verify-default* defaults to + ;; :require and takes precedence. + ;; + ;; Only when --no-ssl-cert-verification is passed as a command line + ;; option do we set cl+ssl:*make-ssl-client-stream-verify-default* + ;; to NIL, then allowing the NONE behaviour set here. + ;; + (ssl-context + (CL+SSL:MAKE-CONTEXT :disabled-protocols nil + :verify-mode CL+SSL:+SSL-VERIFY-NONE+))) (flet ((connect (pgconn username) (handler-case ;; in some cases (client_min_messages set to debug5 @@ -128,20 +140,29 @@ #'(lambda (w) (log-message :warning "~a" w) (muffle-warning)))) - (pomo:connect (db-name pgconn) - (or username (db-user pgconn)) - (db-pass pgconn) - (let ((host (db-host pgconn))) - (if (and (consp host) (eq :unix (car host))) - :unix - host)) - :port (db-port pgconn) - :use-ssl (or (pgconn-use-ssl pgconn) :no))) + (CL+SSL:WITH-GLOBAL-CONTEXT (ssl-context :auto-free-p t) + (pomo:connect (db-name pgconn) + (or username (db-user pgconn)) + (db-pass pgconn) + (let ((host (db-host pgconn))) + (if (and (consp host) (eq :unix (car host))) + :unix + host)) + :port (db-port pgconn) + :use-ssl (or (pgconn-use-ssl pgconn) :no)))) + ((or too-many-connections configuration-limit-exceeded) (e) (log-message :error "Failed to connect to ~a: ~a; will try again in ~fs" pgconn e *retry-connect-delay*) - (sleep *retry-connect-delay*))))) + (sleep *retry-connect-delay*)) + + (CL+SSL:SSL-ERROR-VERIFY (e) + (log-message :error + "Connecting to PostgreSQL ~a: ~a" + (db-host pgconn) e) + (log-message :log "You may try --no-ssl-cert-verification") + (error e))))) (loop :while 
(null (conn-handle pgconn)) :repeat *retry-connect-times* :do (setf (conn-handle pgconn) (connect pgconn username)))) diff --git a/src/utils/threads.lisp b/src/utils/threads.lisp index a2776ce..2581579 100644 --- a/src/utils/threads.lisp +++ b/src/utils/threads.lisp @@ -28,6 +28,10 @@ ;; bindings updates for libs ;; CFFI is used by the SQLite lib (cffi:*default-foreign-encoding* - . ,cffi:*default-foreign-encoding*)))) + . ,cffi:*default-foreign-encoding*) + + ;; CL+SSL can be picky about verifying certs + (cl+ssl:*make-ssl-client-stream-verify-default* + . ,cl+ssl:*make-ssl-client-stream-verify-default*)))) "Wrapper around lparallel:make-kernel that sets our usual bindings." (lp:make-kernel worker-count :bindings bindings)) From e291c502ba97358e15d27fbd1635ba19c8f495f5 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 15 Nov 2018 23:59:51 +0100 Subject: [PATCH 31/69] Install a call to cl+ssl:reload at image startup time, again. Testing shows that it's not just debian which needs it, it's always necessary. Just re-add our tweak now. See #866, see #816, see #807, #794. --- src/hooks.lisp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hooks.lisp b/src/hooks.lisp index f34d405..bb27ea2 100644 --- a/src/hooks.lisp +++ b/src/hooks.lisp @@ -30,10 +30,8 @@ ;; handles some context and things around loading with CFFI. (cl+ssl:reload))) -#| #+ccl (push #'open-foreign-libs *lisp-startup-functions*) #+sbcl (push #'open-foreign-libs sb-ext:*init-hooks*) -|# #+ccl (push #'close-foreign-libs *save-exit-functions*) #+sbcl (push #'close-foreign-libs sb-ext:*save-hooks*) From 8b1acbae877b1e5a2c5274ff8937d2ffaef8b922 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 16 Nov 2018 00:03:31 +0100 Subject: [PATCH 32/69] Make sure the image knows how to print circular data structures. Our catalogs representation is designed to be circular, which helps navigating the graph from anywhere when processing it. 
This means that we need to have *print-circle* set to t in the pgloader image, otherwise we might run into Control stack exhausted when trying to print out debug information... Fixes #865, #800, #810, #859, #824. --- src/hooks.lisp | 7 +++++++ src/utils/threads.lisp | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/hooks.lisp b/src/hooks.lisp index bb27ea2..62c878b 100644 --- a/src/hooks.lisp +++ b/src/hooks.lisp @@ -12,6 +12,13 @@ ;; So that we can #+pgloader-image some code away, see main.lisp (push :pgloader-image *features*) +;;; +;;; We need to support *print-circle* for the debug traces of the catalogs, +;;; and while at it let's enforce *print-pretty* too. +;;; +(setf *print-circle* t *print-pretty* t) + + (in-package #:cl-user) (defun close-foreign-libs () diff --git a/src/utils/threads.lisp b/src/utils/threads.lisp index 2581579..bfa6876 100644 --- a/src/utils/threads.lisp +++ b/src/utils/threads.lisp @@ -7,7 +7,9 @@ (defun make-kernel (worker-count &key (bindings - `((*monitoring-queue* . ,*monitoring-queue*) + `((*print-circle* . ,*print-circle*) + (*print-pretty* . ,*print-pretty*) + (*monitoring-queue* . ,*monitoring-queue*) (*copy-batch-rows* . ,*copy-batch-rows*) (*copy-batch-size* . ,*copy-batch-size*) (*rows-per-range* . ,*rows-per-range*) From 1fd0576ace1de169a6bdc01f017519e9ed796cde Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 16 Nov 2018 00:08:27 +0100 Subject: [PATCH 33/69] Fix Citus support related debug print instructions. 
--- src/utils/citus.lisp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/utils/citus.lisp b/src/utils/citus.lisp index c1b73b8..65743af 100644 --- a/src/utils/citus.lisp +++ b/src/utils/citus.lisp @@ -76,14 +76,16 @@ (setf (citus-distributed-rule-table rule) (citus-find-table catalog table))))) -(defmethod print-object ((table citus-reference-rule) stream) - (print-unreadable-object (table stream :type t :identity t) - (with-slots (table) table - (format stream "distribute ~a as reference" (format-table-name table))))) +(defmethod print-object ((rule citus-reference-rule) stream) + (print-unreadable-object (rule stream :type t :identity t) + (with-slots (table) rule + (format stream + "distribute ~a as reference" + (format-table-name table))))) -(defmethod print-object ((table citus-distributed-rule) stream) - (print-unreadable-object (table stream :type t :identity t) - (with-slots (table using from) table +(defmethod print-object ((rule citus-distributed-rule) stream) + (print-unreadable-object (rule stream :type t :identity t) + (with-slots (table using from) rule (format stream "distribute ~a :using ~a~@[ :from ~{~a~^, ~}~]" (format-table-name table) @@ -231,9 +233,9 @@ (defun add-column-to-pkey (table column-name) "Add COLUMN in the first position of the TABLE's primary key index." (let* ((index (find-if #'index-primary (table-index-list table))) - (idxcol (find column-name (index-columns index) :test #'string=))) - (assert (not (null index))) - (unless idxcol + (idxcol (when index + (find column-name (index-columns index) :test #'string=)))) + (when (and index (null idxcol)) ;; add a new column (push column-name (index-columns index)) ;; now remove origin schema sql and condef, we need to redo them From f07ac6126966b570c912fc326678d1acdb2f8763 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sun, 18 Nov 2018 17:46:41 +0100 Subject: [PATCH 34/69] Fix default/serial handling of pgsql as a source. 
In the recent patch that added support for Redshift "identity" columns, we broke support for PostgreSQL sequences. Unbreak that. --- src/sources/pgsql/pgsql-cast-rules.lisp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sources/pgsql/pgsql-cast-rules.lisp b/src/sources/pgsql/pgsql-cast-rules.lisp index 6c0690e..ea1ef04 100644 --- a/src/sources/pgsql/pgsql-cast-rules.lisp +++ b/src/sources/pgsql/pgsql-cast-rules.lisp @@ -36,9 +36,10 @@ pgloader.catalog::extra) field (let* ((ctype (pgsql-column-ctype field)) - (extra (when (and (stringp (column-default field)) - (search "identity" (column-default field))) - :auto-increment)) + (extra (or pgloader.catalog::extra + (when (and (stringp (column-default field)) + (search "identity" (column-default field))) + :auto-increment))) (pgcol (apply-casting-rules nil pgloader.catalog::name pgloader.catalog::type-name From aa8ae159e2c5714bd1913d8dc381a74b460719a9 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sun, 18 Nov 2018 18:21:51 +0100 Subject: [PATCH 35/69] Improve error handling when applying Citus distribution rules. Make it so that we generate a proper error message to the user when failing to figure out the PATH to the distribution key, rather than failing with an internal error about The value NIL is not of type PGLOADER.CATALOG:TABLE. 
--- src/load/migrate-database.lisp | 13 ++++--- src/package.lisp | 3 +- src/utils/citus.lisp | 65 +++++++++++++++++++++++----------- 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index 3037571..db7d096 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -347,10 +347,15 @@ ;; apply catalog level transformations to support the database migration ;; that's CAST rules, index WHERE clause rewriting and ALTER commands - (process-catalog copy catalog - :alter-table alter-table - :alter-schema alter-schema - :distribute distribute) + (handler-case + (process-catalog copy catalog + :alter-table alter-table + :alter-schema alter-schema + :distribute distribute) + + (citus-rule-is-missing-from-list (e) + (log-message :fatal "~a" e) + (return-from copy-database))) ;; if asked, first drop/create the tables on the PostgreSQL side (handler-case diff --git a/src/package.lisp b/src/package.lisp index 20820ab..307eb3d 100644 --- a/src/package.lisp +++ b/src/package.lisp @@ -298,7 +298,8 @@ #:pgloader.monitor) (:export #:citus-distribute-schema #:citus-format-sql-select - #:citus-backfill-table-p)) + #:citus-backfill-table-p + #:citus-rule-is-missing-from-list)) (defpackage #:pgloader.utils (:use #:cl diff --git a/src/utils/citus.lisp b/src/utils/citus.lisp index 65743af..0bcf329 100644 --- a/src/utils/citus.lisp +++ b/src/utils/citus.lisp @@ -172,6 +172,18 @@ ;;; itself to the table-citus-rule slot so that we later know to generate a ;;; proper SELECT query that includes the backfilling. ;;; +(define-condition citus-rule-is-missing-from-list (error) + ((rule :initarg :rule :accessor citus-rule)) + (:report + (lambda (err stream) + (let ((*print-circle* nil)) + (format stream + "Failed to add column ~s to table ~a for lack of a FROM clause in the distribute rule:~% distribute ~a using ~a from ?" 
+ (column-name (citus-distributed-rule-using (citus-rule err))) + (format-table-name (citus-distributed-rule-table (citus-rule err))) + (format-table-name (citus-distributed-rule-table (citus-rule err))) + (column-name (citus-distributed-rule-using (citus-rule err)))))))) + (defgeneric apply-citus-rule (rule) (:documentation "Apply a Citus distribution RULE to given TABLE.")) @@ -206,28 +218,41 @@ ;; it up in the last entry of the FROM rule's list. (let* ((last-from-rule (car (last (citus-distributed-rule-from rule)))) (column-definition - (find (column-name (citus-distributed-rule-using rule)) - (table-field-list last-from-rule) - :test #'string= - :key #'column-name)) + (when last-from-rule + (find (column-name (citus-distributed-rule-using rule)) + (table-field-list last-from-rule) + :test #'string= + :key #'column-name))) (new-column - (make-column :name (column-name column-definition) - :type-name (column-type-name column-definition) - :nullable (column-nullable column-definition) - :transform (column-transform column-definition)))) - ;; - ;; Here also we need to add the new column to the PKEY definition, - ;; in first position. - ;; - (add-column-to-pkey table (column-name new-column)) + (when column-definition + (make-column :name (column-name column-definition) + :type-name (column-type-name column-definition) + :nullable (column-nullable column-definition) + :transform (column-transform column-definition))))) - ;; - ;; We need to backfill the distribution key in the data, which - ;; we're implementing with a JOIN when we SELECT from the source - ;; table. We add the new field here. - ;; - (push new-column (table-field-list table)) - (push new-column (table-column-list table)))))) + (if column-definition + (progn + ;; + ;; Here also we need to add the new column to the PKEY + ;; definition, in first position. 
+ ;; + (add-column-to-pkey table (column-name new-column)) + + ;; + ;; We need to backfill the distribution key in the data, + ;; which we're implementing with a JOIN when we SELECT from + ;; the source table. We add the new field here. + ;; + (push new-column (table-field-list table)) + (push new-column (table-column-list table))) + + ;; + ;; We don't have any table-field-list in the citus rule, + ;; meaning that the distribute ... using ... clause is lacking + ;; the FROM part, and we need it. + ;; + (error + (make-condition 'citus-rule-is-missing-from-list :rule rule))))))) (defun add-column-to-pkey (table column-name) From 3f2f10eef1f1899ec2b1d93cd9d77353b14cf6f0 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Mon, 19 Nov 2018 19:33:37 +0100 Subject: [PATCH 36/69] Finish implementation of CAST rules for PostgreSQL source databases. Add a link to the table from the internal catalogs for columns so that we can match table-source-name in cast rules when migrating from PostgreSQL. 
--- src/pgsql/pgsql-schema.lisp | 3 ++- src/sources/pgsql/pgsql-cast-rules.lisp | 5 +++-- src/utils/catalog.lisp | 2 +- test/citus/company.load | 2 ++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/pgsql/pgsql-schema.lisp b/src/pgsql/pgsql-schema.lisp index 0b98459..b2d7a27 100644 --- a/src/pgsql/pgsql-schema.lisp +++ b/src/pgsql/pgsql-schema.lisp @@ -185,7 +185,8 @@ :do (let* ((schema (maybe-add-schema catalog schema-name)) (table (maybe-add-table schema table-name :oid table-oid)) - (field (make-column :name name + (field (make-column :table table + :name name :type-name type :type-mod typmod :nullable (not notnull) diff --git a/src/sources/pgsql/pgsql-cast-rules.lisp b/src/sources/pgsql/pgsql-cast-rules.lisp index ea1ef04..d7b003c 100644 --- a/src/sources/pgsql/pgsql-cast-rules.lisp +++ b/src/sources/pgsql/pgsql-cast-rules.lisp @@ -26,7 +26,8 @@ (defmethod cast ((field column) &key &allow-other-keys) "Return the PostgreSQL type definition from the given PostgreSQL column definition" - (with-slots (pgloader.catalog::name + (with-slots (pgloader.catalog::table + pgloader.catalog::name pgloader.catalog::type-name pgloader.catalog::type-mod pgloader.catalog::nullable @@ -40,7 +41,7 @@ (when (and (stringp (column-default field)) (search "identity" (column-default field))) :auto-increment))) - (pgcol (apply-casting-rules nil + (pgcol (apply-casting-rules (table-source-name pgloader.catalog::table) pgloader.catalog::name pgloader.catalog::type-name ctype diff --git a/src/utils/catalog.lisp b/src/utils/catalog.lisp index baca81a..8b0cd62 100644 --- a/src/utils/catalog.lisp +++ b/src/utils/catalog.lisp @@ -71,7 +71,7 @@ ;;; produce, so that we know how to CREATE TABLEs in PostgreSQL whatever the ;;; source is. 
;;; -(defstruct column name type-name type-mod nullable default comment +(defstruct column table name type-name type-mod nullable default comment transform extra (transform-default t)) ;;; diff --git a/test/citus/company.load b/test/citus/company.load index ef4af21..c2f7ad0 100644 --- a/test/citus/company.load +++ b/test/citus/company.load @@ -4,6 +4,8 @@ load database with include drop, reset no sequences + cast column impressions.seen_at to "timestamp with time zone" + distribute companies using id -- distribute campaigns using company_id -- distribute ads using company_id from campaigns From 1c18b41cd72300abf12d67abfd411f9edf2bcad9 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 20 Nov 2018 22:59:43 +0100 Subject: [PATCH 37/69] Implement a new way of building pgloader: make save. This time we directly call into the save-lisp-and-die feature of the implementation. As pgloader only supports SBCL and CCL at the time being, doing things without an abstraction layer is easy enough. This needs more testing and a special version for the bundle case too. One step at a time, etc. --- Makefile | 3 +++ src/save.lisp | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 src/save.lisp diff --git a/Makefile b/Makefile index 06c381a..5708b8d 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,9 @@ pgloader-standalone: test: $(PGLOADER) $(MAKE) PGLOADER=$(realpath $(PGLOADER)) CL=$(CL) -C test regress +save: ./src/save.lisp $(LISP_SRC) + sbcl --no-userinit --load ./src/save.lisp + clean-bundle: rm -rf $(BUNDLEDIR) rm -rf $(BUNDLETESTD)/$(BUNDLENAME)/* diff --git a/src/save.lisp b/src/save.lisp new file mode 100644 index 0000000..3da0967 --- /dev/null +++ b/src/save.lisp @@ -0,0 +1,70 @@ +;;; +;;; Create a build/bin/pgloader executable from the source code, using +;;; Quicklisp to load pgloader and its dependencies. +;;; + +(in-package #:cl-user) + +;; ccl provides an implementation of getenv already. 
+#+sbcl +(defun getenv (name &optional default) + "Return the current value for the environment variable NAME, or default + when unset." + (or (sb-ext:posix-getenv name) default)) + +(require :asdf) ; should work in SBCL and CCL + +(defvar *quicklisp.lisp* "http://beta.quicklisp.org/quicklisp.lisp") + +(let* ((cwd (uiop:getcwd)) + (build-dir (uiop:merge-pathnames* "build/" cwd)) + (ql.lisp (uiop:merge-pathnames* "quicklisp.lisp" build-dir)) + (qldir (uiop:merge-pathnames* "quicklisp/" build-dir)) + (qlsetup (uiop:merge-pathnames* "setup.lisp" qldir))) + ;; + ;; We might have to install Quicklisp in build/quicklisp + ;; + (unless (probe-file qlsetup) + (format t "File ~a is not found, installing Quicklisp from ~a~%" + qlsetup *quicklisp.lisp*) + (uiop:run-program (format nil "curl -o ~a ~a" ql.lisp *quicklisp.lisp*)) + (load ql.lisp) + (let* ((quickstart (find-package "QUICKLISP-QUICKSTART")) + (ql-install (find-symbol "INSTALL" quickstart))) + (funcall ql-install :path qldir :proxy (getenv "http_proxy")))) + + ;; + ;; Now that we have Quicklisp, load it and push our copy of pgloader in + ;; ql:*local-project-directories* where Quicklisp will find it. 
+ ;; + (format t "Loading file ~a~%" qlsetup) + (load qlsetup) + + (let* ((ql (find-package "QL")) + (lpd (find-symbol "*LOCAL-PROJECT-DIRECTORIES*" ql)) + (quickload (find-symbol "QUICKLOAD" ql))) + (push cwd (symbol-value lpd)) + + ;; + ;; And finally load pgloader and its image-based hooks + ;; + (format t "Loading system pgloader~%") + (funcall quickload :pgloader) + (load (asdf:system-relative-pathname :pgloader "src/hooks.lisp")))) + +(defun pgloader-image-main () + (let ((argv #+sbcl sb-ext:*posix-argv* + #+ccl ccl:*command-line-argument-list*)) + (pgloader::main argv))) + +(let ((image-filename "/Users/dim/dev/pgloader/build/bin/pgloader")) + #+ccl + (ccl:save-application image-filename + :toplevel-function #'cl-user::pgloader-image-main + :prepend-kernel t) + #+sbcl + (sb-ext:save-lisp-and-die image-filename + :toplevel #'cl-user::pgloader-image-main + :executable t + :save-runtime-options t + :compression t)) From 743769d750d933eef70f4f148a3b7337630f4dc6 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 21 Nov 2018 10:38:19 +0100 Subject: [PATCH 38/69] Improve handling of errors when fetching the source catalogs. We might have MS SQL failures at this stage, or even Redshift or other PostgreSQL variants failing to execute our catalog queries. Handle conditions by cleanly logging them and returning from copy-database without doing anything. That's the best we can do here. Fixes #605, fixes #757. 
--- src/load/migrate-database.lisp | 39 ++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index db7d096..2dee25b 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -319,19 +319,32 @@ (copy-kernel (make-kernel worker-count)) (copy-channel (let ((lp:*kernel* copy-kernel)) (lp:make-channel))) - (catalog (fetch-metadata - copy - (make-catalog - :name (typecase (source-db copy) - (db-connection (db-name (source-db copy))) - (fd-connection (pathname-name - (fd-path (source-db copy)))))) - :materialize-views materialize-views - :create-indexes create-indexes - :foreign-keys foreign-keys - :only-tables only-tables - :including including - :excluding excluding)) + (catalog (handler-case + (fetch-metadata + copy + (make-catalog + :name (typecase (source-db copy) + (db-connection + (db-name (source-db copy))) + (fd-connection + (pathname-name + (fd-path (source-db copy)))))) + :materialize-views materialize-views + :create-indexes create-indexes + :foreign-keys foreign-keys + :only-tables only-tables + :including including + :excluding excluding) + (mssql::mssql-error (e) + (log-message :error "MSSQL ERROR: ~a" e) + (log-message :log "You might need to review the FreeTDS protocol version in your freetds.conf file, see http://www.freetds.org/userguide/choosingtdsprotocol.htm") + (return-from copy-database)) + (condition (e) + (log-message :error + "ERROR ~a: ~a" + (conn-type (source-db copy)) + e) + (return-from copy-database)))) pkeys (writers-count (make-hash-table :size (count-tables catalog))) (max-indexes (when create-indexes From 4ab26e5387066199bd7ab7fe4f7ea18dbdccbc84 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 21 Nov 2018 17:31:11 +0100 Subject: [PATCH 39/69] Handle other conditions in process-catalogs. 
It might be that some random condition is signaled during process-catalogs, causing the errors reported so far and that I can't reproduce. Let's add some handler-case protection to have more clues about what could be happening. See #865, #800, #810, #859, #824. --- src/load/migrate-database.lisp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/load/migrate-database.lisp b/src/load/migrate-database.lisp index 2dee25b..689894d 100644 --- a/src/load/migrate-database.lisp +++ b/src/load/migrate-database.lisp @@ -341,7 +341,7 @@ (return-from copy-database)) (condition (e) (log-message :error - "ERROR ~a: ~a" + "~a: ~a" (conn-type (source-db copy)) e) (return-from copy-database)))) @@ -368,6 +368,10 @@ (citus-rule-is-missing-from-list (e) (log-message :fatal "~a" e) + (return-from copy-database)) + + (condition (e) + (log-message :fatal "Failed to process catalogs: ~a" e) (return-from copy-database))) ;; if asked, first drop/create the tables on the PostgreSQL side From 18bcf109037f434859d9f2dc80bb44f9b0a8eeab Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 21 Nov 2018 18:17:34 +0100 Subject: [PATCH 40/69] Blind fix for a strange use-case. A user reported a case where pgloader fails to find the table an index has been created on in pgloader catalogs. That's a weird case. For now, just issue a warning about the situation and skip the index. 
--- src/sources/mssql/mssql-schema.lisp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sources/mssql/mssql-schema.lisp b/src/sources/mssql/mssql-schema.lisp index 97d3d36..89be6ff 100644 --- a/src/sources/mssql/mssql-schema.lisp +++ b/src/sources/mssql/mssql-schema.lisp @@ -144,8 +144,14 @@ :columns nil :filter filter)) (index - (maybe-add-index table index-name pg-index :key #'index-name))) - (add-column index colname)) + (when table + (maybe-add-index table index-name pg-index :key #'index-name)))) + (unless table + (log-message :warning + "Failed to find table ~s in schema ~s for index ~s, skipping the index" + table-name schema-name index-name)) + (when index + (add-column index colname))) :finally (return catalog))) (defun list-all-fkeys (catalog &key including excluding) From 6e325f67e0d3dfacae36c1c81b9478ec6eaebb98 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 21 Nov 2018 21:44:56 +0100 Subject: [PATCH 41/69] Implement the save.lisp idea for the bundle. This should make it easier to build pgloader with CCL rather than SBCL, all from the bundle distribution, and also easier to support windows. In passing, add a new file in the bundle distribution: version.sexp should contain a CL string containing the pgloader version string. 
--- Makefile | 10 +++++++--- bundle/Makefile | 3 +++ bundle/save.lisp | 47 +++++++++++++++++++++++++++++++++++++++++++++++ src/save.lisp | 8 ++++++-- 4 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 bundle/save.lisp diff --git a/Makefile b/Makefile index 5708b8d..83c523a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # pgloader build tool APP_NAME = pgloader -VERSION = 3.5.2 +VERSION = 3.6.0 # use either sbcl or ccl CL = sbcl @@ -24,7 +24,7 @@ QLDIR = $(BUILDDIR)/quicklisp MANIFEST = $(BUILDDIR)/manifest.ql LATEST = $(BUILDDIR)/pgloader-latest.tgz -BUNDLEDIST = 2018-04-30 +BUNDLEDIST = 2018-10-18 BUNDLENAME = pgloader-bundle-$(VERSION) BUNDLEDIR = $(BUILDDIR)/bundle/$(BUNDLENAME) BUNDLE = $(BUILDDIR)/$(BUNDLENAME).tgz @@ -182,8 +182,12 @@ $(BUNDLEDIR): --eval '(defvar *ql-dist* "$(BUNDLEDIST)")' \ --load bundle/ql.lisp -$(BUNDLE): $(BUNDLEDIR) +$(BUNDLEDIR)/version.sexp: $(BUNDLEDIR) + echo "\"$(VERSION)\"" > $@ + +$(BUNDLE): $(BUNDLEDIR) $(BUNDLEDIR)/version.sexp cp bundle/README.md $(BUNDLEDIR) + cp bundle/save.lisp $(BUNDLEDIR) sed -e s/%VERSION%/$(VERSION)/ < bundle/Makefile > $(BUNDLEDIR)/Makefile git archive --format=tar --prefix=pgloader-$(VERSION)/ master \ | tar -C $(BUNDLEDIR)/local-projects/ -xf - diff --git a/bundle/Makefile b/bundle/Makefile index 246438f..9102bd4 100644 --- a/bundle/Makefile +++ b/bundle/Makefile @@ -61,4 +61,7 @@ $(PGLOADER): $(BUILDAPP) test: $(PGLOADER) $(MAKE) PGLOADER=$(realpath $(PGLOADER)) -C $(SRCDIR)/test regress +save: + sbcl --no-userinit --load ./save.lisp + check: test ; diff --git a/bundle/save.lisp b/bundle/save.lisp new file mode 100644 index 0000000..d955b6c --- /dev/null +++ b/bundle/save.lisp @@ -0,0 +1,47 @@ +;;; +;;; Create a build/bin/pgloader executable from the source code, using +;;; Quicklisp to load pgloader and its dependencies. 
+;;; + +(in-package #:cl-user) + +(require :asdf) ; should work in SBCL and CCL + +(let* ((cwd (uiop:getcwd)) + (bundle.lisp (uiop:merge-pathnames* "bundle.lisp" cwd)) + (version-file (uiop:merge-pathnames* "version.sexp" cwd)) + (version-string (uiop:read-file-form version-file)) + (asdf:*central-registry* (list cwd))) + + (format t "Loading bundle.lisp~%") + (load bundle.lisp) + + (format t "Loading system pgloader ~a~%" version-string) + (asdf:load-system :pgloader :verbose nil) + (load (asdf:system-relative-pathname :pgloader "src/hooks.lisp")) + + (let* ((pgl (find-package "PGLOADER")) + (version-symbol (find-symbol "*VERSION-STRING*" pgl))) + (setf (symbol-value version-symbol) version-string))) + +(defun pgloader-image-main () + (let ((argv #+sbcl sb-ext:*posix-argv* + #+ccl ccl:*command-line-argument-list*)) + (pgloader::main argv))) + +(let* ((cwd (uiop:getcwd)) + (bin-dir (uiop:merge-pathnames* "bin/" cwd)) + (bin-filename (uiop:merge-pathnames* "pgloader" bin-dir))) + + (ensure-directories-exist bin-dir) + + #+ccl + (ccl:save-application bin-filename + :toplevel-function #'cl-user::pgloader-image-main + :prepend-kernel t) + #+sbcl + (sb-ext:save-lisp-and-die bin-filename + :toplevel #'cl-user::pgloader-image-main + :executable t + :save-runtime-options t + :compression t)) diff --git a/src/save.lisp b/src/save.lisp index 3da0967..43b0de2 100644 --- a/src/save.lisp +++ b/src/save.lisp @@ -27,7 +27,9 @@ (unless (probe-file qlsetup) (format t "File ~a is not found, installing Quicklisp from ~a~%" qlsetup *quicklisp.lisp*) - (uiop:run-program (format nil "curl -o ~a ~a" ql.lisp *quicklisp.lisp*)) + (let ((command (format nil "curl -o ~a ~a" ql.lisp *quicklisp.lisp*))) + (format t "Running command: ~a~%" command) + (uiop:run-program command)) (load ql.lisp) (let* ((quickstart (find-package "QUICKLISP-QUICKSTART")) (ql-install (find-symbol "INSTALL" quickstart))) @@ -57,7 +59,9 @@ #+ccl ccl:*command-line-argument-list*)) (pgloader::main argv))) -(let 
((image-filename "/Users/dim/dev/pgloader/build/bin/pgloader")) +(let* ((cwd (uiop:getcwd)) + (build-dir (uiop:merge-pathnames* "build/bin/" cwd)) + (image-filename (uiop:merge-pathnames* "pgloader" build-dir))) #+ccl (ccl:save-application image-filename :toplevel-function #'cl-user::pgloader-image-main From 801d8a6e0939579b4a00de5011712af4d5738cd0 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 23 Nov 2018 10:43:58 +0100 Subject: [PATCH 42/69] Add support for MS SQL time data type. As for the other datetime types we have to use CONVERT at the SQL level in order to get a format that PostgreSQL understands. This time the magic number for it is 114. --- src/monkey/mssql.lisp | 2 +- src/sources/mssql/mssql-schema.lisp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/monkey/mssql.lisp b/src/monkey/mssql.lisp index ce2abf6..5790b17 100644 --- a/src/monkey/mssql.lisp +++ b/src/monkey/mssql.lisp @@ -95,7 +95,7 @@ (:syb-int8 (mem-ref data :int8)) (:syb-real (mem-ref data :float)) (:syb-flt8 (mem-ref data :double)) - ((:syb-datetime :syb-datetime4 :syb-msdate) + ((:syb-datetime :syb-datetime4 :syb-msdate :syb-mstime) (with-foreign-pointer (%buf +numeric-buf-sz+) (let ((count (%dbconvert %dbproc diff --git a/src/sources/mssql/mssql-schema.lisp b/src/sources/mssql/mssql-schema.lisp index 89be6ff..c743647 100644 --- a/src/sources/mssql/mssql-schema.lisp +++ b/src/sources/mssql/mssql-schema.lisp @@ -201,6 +201,7 @@ Mostly we just use the name, and make try to avoid parsing dates." 
(case (intern (string-upcase type) "KEYWORD") + (:time (format nil "convert(varchar, [~a], 114)" name)) (:datetime (format nil "convert(varchar, [~a], 126)" name)) (:smalldatetime (format nil "convert(varchar, [~a], 126)" name)) (:date (format nil "convert(varchar, [~a], 126)" name)) From ab2cadff24f58c933b2c6afd29604c5e938eb8c7 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 30 Nov 2018 15:38:31 +0100 Subject: [PATCH 43/69] Simplify the regular expresion parsing the PostgreSQL version string. The debian/Ubuntu packaging would defeat the quite simple regexp parsing PostgreSQL version string that we have in pgloader. To make it more robust, make it more open to unforeseen strings. See #800, see #810. --- src/pgsql/connection.lisp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pgsql/connection.lisp b/src/pgsql/connection.lisp index 9896559..404b451 100644 --- a/src/pgsql/connection.lisp +++ b/src/pgsql/connection.lisp @@ -410,10 +410,11 @@ ;;; ;;; PostgreSQL 8.0.2 on i686-pc-linux-gnu, compiled by GCC gcc (GCC) 3.4.2 20041017 (Red Hat 3.4.2-6.fc3), Redshift 1.0.2058 ;;; PostgreSQL 10.1 on x86_64-apple-darwin14.5.0, compiled by Apple LLVM version 7.0.0 (clang-700.1.76), 64-bit +;;; PostgreSQL 10.6 (Ubuntu 10.6-1.pgdg14.04+1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 4.8.4-2ubuntu1~14.04.4) 4.8.4, 64-bit (defun parse-postgresql-version-string (version-string) "Parse PostgreSQL select version() output." (cl-ppcre:register-groups-bind (full-version maybe-variant) - ("PostgreSQL ([0-9.]+) on .*, [^,]+, (.*)" version-string) + ("PostgreSQL ([0-9.]+) [^,]+, [^,]+, (.*)" version-string) (let* ((version-dots (split-sequence:split-sequence #\. 
full-version)) (major-version (if (= 3 (length version-dots)) (format nil "~a.~a" From a939d20dff19aa3fbfb0ea3d4360f43dbd6537ae Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sat, 1 Dec 2018 21:27:26 +0100 Subject: [PATCH 44/69] Unquote names when searching for an index column name in its table. If the source database is using a keyword (such as "order") as a column name, then pgloader is going to quote this column name in its internal catalogs. In that case, unquote the column in the pgloader catalogs when matching it against the unquoted column name we have in the index definition. Fixes #872. --- src/pgsql/pgsql-ddl.lisp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pgsql/pgsql-ddl.lisp b/src/pgsql/pgsql-ddl.lisp index 662bc95..008fe7d 100644 --- a/src/pgsql/pgsql-ddl.lisp +++ b/src/pgsql/pgsql-ddl.lisp @@ -266,7 +266,9 @@ :collect (column-type-name (find idx-col tbl-cols :test #'string-equal - :key #'column-name)))) + :key (lambda (col) + (ensure-unquoted + (column-name col))))))) (nobtree (catalog-types-without-btree (schema-catalog (table-schema (index-table index)))))) (let* ((idx-type (first idx-types)) From af2995b91804aa52e90f3845b8a46cd46ac66504 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sun, 2 Dec 2018 00:17:26 +0100 Subject: [PATCH 45/69] Apply quoting rules to SQLite index column names. The previous fix was wrong for missing the point: rather than unquote column names in the table definition when matching the column names in the index definition, we should in the first place have quoted the index column names when needed. Fixes #872 for real this time. 
--- src/pgsql/pgsql-ddl.lisp | 4 +--- src/sources/sqlite/sqlite-schema.lisp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pgsql/pgsql-ddl.lisp b/src/pgsql/pgsql-ddl.lisp index 008fe7d..662bc95 100644 --- a/src/pgsql/pgsql-ddl.lisp +++ b/src/pgsql/pgsql-ddl.lisp @@ -266,9 +266,7 @@ :collect (column-type-name (find idx-col tbl-cols :test #'string-equal - :key (lambda (col) - (ensure-unquoted - (column-name col))))))) + :key #'column-name)))) (nobtree (catalog-types-without-btree (schema-catalog (table-schema (index-table index)))))) (let* ((idx-type (first idx-types)) diff --git a/src/sources/sqlite/sqlite-schema.lisp b/src/sources/sqlite/sqlite-schema.lisp index 7174fea..17f05ba 100644 --- a/src/sources/sqlite/sqlite-schema.lisp +++ b/src/sources/sqlite/sqlite-schema.lisp @@ -150,7 +150,7 @@ "Return the list of columns in INDEX-NAME." (let ((sql (format nil (sql "/sqlite/list-index-cols.sql") index-name))) (loop :for (index-pos table-pos col-name) :in (sqlite:execute-to-list db sql) - :collect col-name))) + :collect (apply-identifier-case col-name)))) (defun list-indexes (table &optional (db *sqlite-db*)) "Return the list of indexes attached to TABLE." From 56d24de67a3a31a92f514ff1db5eb686ec2d0a63 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Tue, 11 Dec 2018 14:25:08 +0900 Subject: [PATCH 46/69] Update documentation with new features. We have a lot of new features to document. This is a first patch about that, some more work is to be done. That said, it's better than nothing already. 
--- docs/index.rst | 4 + docs/intro.rst | 8 + docs/pgloader.rst | 28 +++ docs/ref/mysql.rst | 15 +- docs/ref/pgsql-citus-target.rst | 77 ++++++ docs/ref/pgsql-redshift-source.rst | 12 + docs/ref/pgsql-redshift-target.rst | 10 + docs/ref/pgsql.rst | 371 +++++++++++++++++++++++++++++ 8 files changed, 521 insertions(+), 4 deletions(-) create mode 100644 docs/ref/pgsql-citus-target.rst create mode 100644 docs/ref/pgsql-redshift-source.rst create mode 100644 docs/ref/pgsql-redshift-target.rst create mode 100644 docs/ref/pgsql.rst diff --git a/docs/index.rst b/docs/index.rst index d69915e..3fb2f9a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,10 @@ Welcome to pgloader's documentation! ref/mysql ref/sqlite ref/mssql + ref/pgsql + ref/pgsql-citus-target + ref/pgsql-redshift-source + ref/pgsql-redshift-target ref/transforms bugreport diff --git a/docs/intro.rst b/docs/intro.rst index 0dc75e2..2a098d9 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -17,6 +17,14 @@ pgloader knows how to read data from different kind of sources: * SQLite * MySQL * MS SQL Server + * PostgreSQL + * Redshift + +pgloader knows how to target different products using the PostgreSQL Protocol: + + * PostgreSQL + * `Citus `_ + * Redshift The level of automation provided by pgloader depends on the data source type. In the case of CSV and Fixed Format files, a full description of the diff --git a/docs/pgloader.rst b/docs/pgloader.rst index 4a8cade..00fa186 100644 --- a/docs/pgloader.rst +++ b/docs/pgloader.rst @@ -154,6 +154,18 @@ Those options are meant to tweak `pgloader` behavior when loading data. machine code) another version of itself, usually a newer one like a very recent git checkout.
+ * `--no-ssl-cert-verification` + + Uses the OpenSSL option to accept a locally issued server-side + certificate, avoiding the following error message:: + + SSL verify error: 20 X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY + + The right way to fix the SSL issue is to use a trusted certificate, of + course. Sometimes though it's useful to make progress with the pgloader + setup while the certificate chain of trust is being fixed, maybe by + another team. That's when this option is useful. + Command Line Only Operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -552,6 +564,22 @@ queries from a SQL file. Implements support for PostgreSQL dollar-quoting and the `\i` and `\ir` include facilities as in `psql` batch mode (where they are the same thing). +AFTER CREATE SCHEMA DO +^^^^^^^^^^^^^^^^^^^^^^ + +Same format as *BEFORE LOAD DO*, the dollar-quoted queries found in that +section are executed once the schema has been created by pgloader, and +before the data is loaded. It's the right time to ALTER TABLE or do some +custom implementation on-top of what pgloader does, like maybe partitioning. + +AFTER CREATE SCHEMA EXECUTE +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Same behaviour as in the *AFTER CREATE SCHEMA DO* clause. Allows you to read +the SQL queries from a SQL file. Implements support for PostgreSQL +dollar-quoting and the `\i` and `\ir` include facilities as in `psql` batch +mode (where they are the same thing). + Connection String ^^^^^^^^^^^^^^^^^ diff --git a/docs/ref/mysql.rst b/docs/ref/mysql.rst index dec33a9..a55f5d7 100644 --- a/docs/ref/mysql.rst +++ b/docs/ref/mysql.rst @@ -1,10 +1,9 @@ Migrating a MySQL Database to PostgreSQL ======================================== -This command instructs pgloader to load data from a database connection. The -only supported database source is currently *MySQL*, and pgloader supports -dynamically converting the schema of the source database and the indexes -building.
+This command instructs pgloader to load data from a database connection. +pgloader supports dynamically converting the schema of the source database +and the indexes building. A default set of casting rules are provided and might be overloaded and appended to by the command. @@ -609,6 +608,14 @@ Date:: to timestamptz drop default using zero-dates-to-null + type datetime with extra on update current timestamp when not null + to timestamptz drop not null drop default + using zero-dates-to-null + + type datetime with extra on update current timestamp + to timestamptz drop default + using zero-dates-to-null + type timestamp when default "0000-00-00 00:00:00" and not null to timestamptz drop not null drop default using zero-dates-to-null diff --git a/docs/ref/pgsql-citus-target.rst b/docs/ref/pgsql-citus-target.rst new file mode 100644 index 0000000..257e081 --- /dev/null +++ b/docs/ref/pgsql-citus-target.rst @@ -0,0 +1,77 @@ +Migrating a PostgreSQL Database to Citus +======================================== + +This command instructs pgloader to load data from a database connection. +Automatic discovery of the schema is supported, including build of the +indexes, primary and foreign keys constraints. A default set of casting +rules are provided and might be overloaded and appended to by the command. + +Automatic distribution column backfilling is supported, either from commands +that specify what is the distribution column in every table, or only in the +main table, then relying on foreign key constraints to discover the other +distribution keys. 
+ +Here's a short example of migrating a database from a PostgreSQL server to +another: + +:: + + load database + from pgsql:///hackathon + into pgsql://localhost:9700/dim + + with include drop, reset no sequences + + cast column impressions.seen_at to "timestamp with time zone" + + distribute companies using id + -- distribute campaigns using company_id + -- distribute ads using company_id from campaigns + -- distribute clicks using company_id from ads, campaigns + -- distribute impressions using company_id from ads, campaigns + ; + +Everything works exactly the same way as when doing a PostgreSQL to +PostgreSQL migration, with the added functionality of this new `distribute` +command. + +Distribute Command +^^^^^^^^^^^^^^^^^^ + +The distribute command syntax is as follows:: + + distribute <table name> using <column name> + distribute <table name> using <column name> from <table> [, <table>, ...] + distribute <table name> as reference table + +When using the distribute command, the following steps are added to pgloader +operations when migrating the schema: + + - if the distribution column does not exist in the table, it is added as + the first column of the table + + - if the distribution column does not exist in the primary key of the + table, it is added as the first column of the primary key of the table + + - all the foreign keys that point to the table are added the distribution + key automatically too, including the source tables of the foreign key + constraints + + - once the schema has been created on the target database, pgloader then + issues Citus specific command `create_reference_table() + `_ + and `create_distributed_table() + `_ + to make the tables distributed + +Those operations are done in the schema section of pgloader, before the data +is loaded. When the data is loaded, the newly added columns need to be +backfilled from referenced data. pgloader knows how to do that by generating +a query like the following and importing the result set of such a query +rather than the raw data from the source table. + +Citus Migration: Limitations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The way pgloader implements *reset sequence* does not work with Citus at +this point, so sequences need to be taken care of separately at this point. diff --git a/docs/ref/pgsql-redshift-source.rst b/docs/ref/pgsql-redshift-source.rst new file mode 100644 index 0000000..b69b6d9 --- /dev/null +++ b/docs/ref/pgsql-redshift-source.rst @@ -0,0 +1,12 @@ +Migrating a Redshift Database to PostgreSQL +=========================================== + +This command instructs pgloader to load data from a database connection. +Automatic discovery of the schema is supported, including build of the +indexes, primary and foreign keys constraints. A default set of casting +rules are provided and might be overloaded and appended to by the command.
+ +The command and behavior are the same as when migrating from a PostgreSQL +database source. pgloader automatically discovers that it's talking to a +Redshift database by parsing the output of the `SELECT version()` SQL query. + diff --git a/docs/ref/pgsql-redshift-target.rst b/docs/ref/pgsql-redshift-target.rst new file mode 100644 index 0000000..50cc356 --- /dev/null +++ b/docs/ref/pgsql-redshift-target.rst @@ -0,0 +1,10 @@ +Migrating a PostgreSQL Database to Redshift +=========================================== + +This command instructs pgloader to load data from a database connection. +Automatic discovery of the schema is supported, including build of the +indexes, primary and foreign keys constraints. A default set of casting +rules are provided and might be overloaded and appended to by the command. + + +TODO: add details about S3 credentials and bucket configuration. diff --git a/docs/ref/pgsql.rst b/docs/ref/pgsql.rst new file mode 100644 index 0000000..d233ffa --- /dev/null +++ b/docs/ref/pgsql.rst @@ -0,0 +1,371 @@ +Migrating a PostgreSQL Database to PostgreSQL +============================================= + +This command instructs pgloader to load data from a database connection. +Automatic discovery of the schema is supported, including build of the +indexes, primary and foreign keys constraints. A default set of casting +rules are provided and might be overloaded and appended to by the command. + +Here's a short example of migrating a database from a PostgreSQL server to +another: + +:: + + load database + from pgsql://localhost/pgloader + into pgsql://localhost/copy + + including only table names matching 'bits', ~/utilisateur/ in schema 'mysql' + including only table names matching ~/geolocations/ in schema 'public' + ; + +PostgreSQL Database Source Specification: FROM +---------------------------------------------- + +Must be a connection URL pointing to a PostgreSQL database.
+ +See the `SOURCE CONNECTION STRING` section above for details on how to write +the connection string. + +:: + + pgsql://[user[:password]@][netloc][:port][/dbname][?option=value&...] + + +PostgreSQL Database Migration Options: WITH +------------------------------------------- + +When loading from a `PostgreSQL` database, the following options are +supported, and the default *WITH* clause is: *no truncate*, *create schema*, +*create tables*, *include drop*, *create indexes*, *reset sequences*, +*foreign keys*, *downcase identifiers*, *uniquify index names*, *reindex*. + + - *include drop* + + When this option is listed, pgloader drops all the tables in the target + PostgreSQL database whose names appear in the source database. This + option allows for using the same command several times in a row until + you figure out all the options, starting automatically from a clean + environment. Please note that `CASCADE` is used to ensure that tables + are dropped even if there are foreign keys pointing to them. This is + precisely what `include drop` is intended to do: drop all target tables + and recreate them. + + Great care needs to be taken when using `include drop`, as it will + cascade to *all* objects referencing the target tables, possibly + including other tables that are not being loaded from the source DB. + + - *include no drop* + + When this option is listed, pgloader will not include any `DROP` + statement when loading the data. + + - *truncate* + + When this option is listed, pgloader issues the `TRUNCATE` command + against each PostgreSQL table just before loading data into it. + + - *no truncate* + + When this option is listed, pgloader issues no `TRUNCATE` command. + + - *disable triggers* + + When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE + TRIGGER ALL` command against the PostgreSQL target table before copying + the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the + `COPY` is done.
+ + This option allows loading data into a pre-existing table ignoring the + *foreign key constraints* and user defined triggers and may result in + invalid *foreign key constraints* once the data is loaded. Use with + care. + + - *create tables* + + When this option is listed, pgloader creates the tables using the + metadata found in the source `PostgreSQL` database, which must contain a list of fields with + their data types. A standard data type conversion from the source to the target PostgreSQL database + is done. + + - *create no tables* + + When this option is listed, pgloader skips the creation of tables before + loading data; target tables must then already exist. + + Also, when using *create no tables* pgloader fetches the metadata from + the current target database and checks type casting, then will remove + constraints and indexes prior to loading the data and install them back + again once the loading is done. + + - *create indexes* + + When this option is listed, pgloader gets the definitions of all the + indexes found in the source PostgreSQL database and creates the same set of index + definitions against the target PostgreSQL database. + + - *create no indexes* + + When this option is listed, pgloader skips creating indexes. + + - *drop indexes* + + When this option is listed, pgloader drops the indexes in the target + database before loading the data, and creates them again at the end + of the data copy. + + - *reindex* + + When this option is used, pgloader does both *drop indexes* before + loading the data and *create indexes* once data is loaded. + + - *drop schema* + + When this option is listed, pgloader drops the target schema in the + target PostgreSQL database before creating it again and all the objects + it contains. The default behavior doesn't drop the target schemas. + + - *foreign keys* + + When this option is listed, pgloader gets the definitions of all the + foreign keys found in the source PostgreSQL database and creates the same set of + foreign key definitions against the target PostgreSQL database.
+ + - *no foreign keys* + + When this option is listed, pgloader skips creating foreign keys. + + - *reset sequences* + + When this option is listed, at the end of the data loading and after the + indexes have all been created, pgloader resets all the PostgreSQL + sequences created to the current maximum value of the column they are + attached to. + + The options *schema only* and *data only* have no effects on this + option. + + - *reset no sequences* + + When this option is listed, pgloader skips resetting sequences after the + load. + + The options *schema only* and *data only* have no effects on this + option. + + - *downcase identifiers* + + When this option is listed, pgloader converts all MySQL identifiers + (table names, index names, column names) to *downcase*, except for + PostgreSQL *reserved* keywords. + + The PostgreSQL *reserved* keywords are determined dynamically by using + the system function `pg_get_keywords()`. + + - *quote identifiers* + + When this option is listed, pgloader quotes all MySQL identifiers so + that their case is respected. Note that you will then have to do the + same thing in your application code queries. + + - *schema only* + + When this option is listed pgloader refrains from migrating the data + over. Note that the schema in this context includes the indexes when the + option *create indexes* has been listed. + + - *data only* + + When this option is listed pgloader only issues the `COPY` statements, + without doing any other processing. + + - *rows per range* + + How many rows are fetched per `SELECT` query when using *multiple + readers per thread*, see above for details. + +PostgreSQL Database Casting Rules +--------------------------------- + +The command *CAST* introduces user-defined casting rules. + +The cast clause allows to specify custom casting rules, either to overload +the default casting rules or to amend them with special cases. + +A casting rule is expected to follow one of the forms:: + + type [ ... ] to [