mirror of https://github.com/dimitri/pgloader.git, synced 2025-08-07 23:07:00 +02:00
The idea is for pgloader to tweak the schema from a description of the sharding model, the distribute clause. Here's an example of such a clause:

    distribute company using id
    distribute campaign using company_id
    distribute ads using company_id from campaign
    distribute clicks using company_id from ads, campaign

Given such commands, pgloader adds the distribution key to each table that needs it, to the primary key definition of the table, and also to the foreign keys that point to the changed primary key. Then, when SELECTing the data from the source database, pgloader automatically JOINs the base table with the source table where the distribution key is found, in case the key was just added to the schema. Finally, pgloader also calls the following Citus commands:

    SELECT create_distributed_table('company', 'id');
    SELECT create_distributed_table('campaign', 'company_id');
    SELECT create_distributed_table('ads', 'company_id');
    SELECT create_distributed_table('clicks', 'company_id');
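
For context, a complete pgloader load command using these clauses might look like the following sketch; the connection strings and the WITH options are illustrative, not taken from the commit:

    load database
         from mysql://user@localhost/hackathon
         into pgsql://user@localhost/hackathon

     with include drop, create tables, create indexes, reset sequences

     distribute company using id
     distribute campaign using company_id
     distribute ads using company_id from campaign
     distribute clicks using company_id from ads, campaign;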
489 lines · 21 KiB · Common Lisp
;;;
;;; Generic API for pgloader sources
;;; Methods for database source types (with introspection)
;;;

(in-package :pgloader.load)

;;;
;;; Prepare the PostgreSQL database before streaming the data into it.
;;;
(defmethod prepare-pgsql-database ((copy db-copy)
                                   (catalog catalog)
                                   &key
                                     truncate
                                     create-tables
                                     create-schemas
                                     drop-schema
                                     drop-indexes
                                     set-table-oids
                                     materialize-views
                                     foreign-keys
                                     include-drop
                                     distribute)
  "Prepare the target PostgreSQL database: create tables casting datatypes
   from the MySQL definitions, prepare index definitions and create target
   tables for materialized views.

   That function mutates index definitions in ALL-INDEXES."
  (log-message :notice "Prepare PostgreSQL database.")

  (with-pgsql-transaction (:pgconn (target-db copy))

    (finalize-catalogs catalog (pgconn-variant (target-db copy)))

    (if create-tables
        (progn
          (when create-schemas
            (with-stats-collection ("Create Schemas" :section :pre
                                                     :use-result-as-read t
                                                     :use-result-as-rows t)
              (create-schemas catalog
                              :include-drop drop-schema
                              :client-min-messages :error)))

          ;; create new SQL types (ENUMs, SETs) if needed, and before we
          ;; get to the table definitions that will use them
          (with-stats-collection ("Create SQL Types" :section :pre
                                                     :use-result-as-read t
                                                     :use-result-as-rows t)
            ;; some SQL types come from extensions (ip4r, hstore, etc)
            (create-extensions catalog
                               :include-drop include-drop
                               :if-not-exists t
                               :client-min-messages :error)

            (create-sqltypes catalog
                             :include-drop include-drop
                             :client-min-messages :error))

          ;; now the tables
          (with-stats-collection ("Create tables" :section :pre
                                                  :use-result-as-read t
                                                  :use-result-as-rows t)
            (create-tables catalog
                           :include-drop include-drop
                           :client-min-messages :error)))

        (progn
          ;; if we're not going to create the tables, now is the time to
          ;; remove the constraints: indexes, primary keys, foreign keys
          ;;
          ;; to be able to do that properly, get the constraints from
          ;; the pre-existing target database catalog
          (let ((pgsql-catalog
                 (fetch-pgsql-catalog (db-name (target-db copy))
                                      :source-catalog catalog)))
            (merge-catalogs catalog pgsql-catalog))

          ;; drop the foreign keys first and only then the indexes, because
          ;; dropping a constraint on a primary key cascades to the drop of
          ;; any foreign key that targets the primary key
          (when foreign-keys
            (with-stats-collection ("Drop Foreign Keys" :section :pre
                                                        :use-result-as-read t
                                                        :use-result-as-rows t)
              (drop-pgsql-fkeys catalog :log-level :notice)))

          (when drop-indexes
            (with-stats-collection ("Drop Indexes" :section :pre
                                                   :use-result-as-read t
                                                   :use-result-as-rows t)
              ;; we want to error out early in case we can't DROP the
              ;; index, so don't CASCADE
              (drop-indexes catalog :cascade nil :log-level :notice)))

          (when truncate
            (with-stats-collection ("Truncate" :section :pre
                                               :use-result-as-read t
                                               :use-result-as-rows t)
              (truncate-tables catalog)))))

    ;; Some database sources allow the same index name to be used against
    ;; several tables, so we add the PostgreSQL table OID to the index
    ;; name, to differentiate. Set the table OIDs now.
    (when (and create-tables set-table-oids)
      (with-stats-collection ("Set Table OIDs" :section :pre
                                               :use-result-as-read t
                                               :use-result-as-rows t)
        (set-table-oids catalog :variant (pgconn-variant (target-db copy)))))

    ;; We might have to MATERIALIZE VIEWS
    (when (and create-tables materialize-views)
      (with-stats-collection ("Create MatViews Tables" :section :pre
                                                       :use-result-as-read t
                                                       :use-result-as-rows t)
        (create-views catalog
                      :include-drop include-drop
                      :client-min-messages :error)))

    ;; Citus Support
    (when distribute
      (with-stats-collection ("Citus Distribute Tables" :section :pre)
        (let ((citus-sql
               (loop :for rule :in distribute
                     :collect (format-create-sql rule))))
          (pgsql-execute citus-sql :client-min-messages :notice)))))

  ;; log the catalog we just fetched and (maybe) merged
  (log-message :data "CATALOG: ~s" catalog))
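
;; A minimal usage sketch, not part of the original file: this mirrors how
;; copy-database (below) drives the method above, with COPY a db-copy
;; instance, CATALOG the catalog fetched from the source database, and
;; DISTRIBUTION-RULES a hypothetical list of parsed distribute rules.
;;
;; (prepare-pgsql-database copy catalog
;;                         :create-tables t
;;                         :create-schemas t
;;                         :include-drop t
;;                         :distribute distribution-rules)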

(defmethod complete-pgsql-database ((copy db-copy)
                                    (catalog catalog)
                                    pkeys
                                    &key
                                      foreign-keys
                                      create-indexes
                                      create-triggers
                                      reset-sequences)
  "After loading the data into PostgreSQL, we can now reset the sequences
   and declare foreign keys."
  ;;
  ;; Now Reset Sequences: the good time to do that is once the whole data
  ;; set has been imported and once we have the indexes in place, as max()
  ;; is able to benefit from the indexes. In particular, avoid doing that
  ;; step while CREATE INDEX statements are in flight (avoid locking).
  ;;
  (log-message :notice "Completing PostgreSQL database.")

  (when reset-sequences
    (reset-sequences (clone-connection (target-db copy)) catalog))

  (handler-case
      (with-pgsql-transaction (:pgconn (clone-connection (target-db copy)))
        ;;
        ;; Turn UNIQUE indexes into PRIMARY KEYS now
        ;;
        (when create-indexes
          (pgsql-execute-with-timing :post "Primary Keys" pkeys
                                     :log-level :notice))

        ;;
        ;; Foreign Key Constraints
        ;;
        ;; We need to have finished loading both the reference and the
        ;; referring tables to be able to build the foreign keys, so wait
        ;; until all tables and indexes are imported before doing that.
        ;;
        (when foreign-keys
          (create-pgsql-fkeys catalog
                              :section :post
                              :label "Create Foreign Keys"
                              :log-level :notice))

        ;;
        ;; Triggers and stored procedures -- includes special default values
        ;;
        (when create-triggers
          (create-triggers catalog
                           :section :post
                           :label "Create Triggers"))

        ;;
        ;; Add schemas that need to be in the search_path to the database
        ;; search_path, when using PostgreSQL. Redshift doesn't know how to
        ;; do that, unfortunately.
        ;;
        (unless (eq :redshift (pgconn-variant (target-db copy)))
          (add-to-search-path catalog
                              :section :post
                              :label "Set Search Path"))

        ;;
        ;; And now, comments on tables and columns.
        ;;
        (comment-on-tables-and-columns catalog
                                       :section :post
                                       :label "Install Comments"))

    (postgresql-unavailable (condition)
      (log-message :error "~a" condition)
      (log-message :error
                   "Complete PostgreSQL database reconnecting to PostgreSQL.")

      ;; in order to avoid Socket error in "connect": ECONNREFUSED if we
      ;; try again too soon, wait a little
      (sleep 2)

      ;;
      ;; Resetting sequences can be done several times safely, and the rest
      ;; of the operations run in a single transaction, so if the connection
      ;; was lost, nothing has been done. Retry.
      ;;
      (complete-pgsql-database copy
                               catalog
                               pkeys
                               :foreign-keys foreign-keys
                               :create-indexes create-indexes
                               :create-triggers create-triggers
                               :reset-sequences reset-sequences))))
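
;; A usage sketch, not part of the original file: PKEYS carries the
;; statements collected while creating unique indexes, which the method
;; above executes to turn those indexes into primary keys. As its retry
;; handler relies on, calling it again after a lost connection is safe.
;;
;; (complete-pgsql-database copy catalog pkeys
;;                          :foreign-keys t
;;                          :create-indexes t
;;                          :create-triggers nil
;;                          :reset-sequences t)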

(defun process-catalog (copy catalog &key alter-table alter-schema distribute)
  "Do all the PostgreSQL catalog tweaking here: casts, index WHERE clause
   rewriting, pgloader level alter schema and alter table commands."

  ;; cast the catalog into something PostgreSQL can work on
  (cast catalog)

  ;; support code for index filters (where clauses)
  (process-index-definitions catalog :sql-dialect (class-name (class-of copy)))

  ;; we may have to alter schemas
  (when alter-schema
    (alter-schema catalog alter-schema))

  ;; if asked, now alter the catalog with given rules: the alter-table
  ;; keyword parameter actually contains a set of alter table rules.
  (when alter-table
    (alter-table catalog alter-table))

  ;; we also support schema changes necessary for Citus distribution
  (when distribute
    (pgloader.catalog::citus-distribute-schema catalog distribute)))
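
;; A usage sketch, not from the original source: COPY is a db-copy
;; instance, CATALOG the catalog fetched from the source database, and
;; DISTRIBUTION-RULES a hypothetical list of parsed distribute rules.
;; Casts and index rewrites always run; the three keyword-driven steps
;; apply only when rules are given.
;;
;; (process-catalog copy catalog
;;                  :alter-table nil
;;                  :alter-schema nil
;;                  :distribute distribution-rules)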

;;;
;;; Generic enough implementation of the copy-database method.
;;;
(defmethod copy-database ((copy db-copy)
                          &key
                            (on-error-stop *on-error-stop*)
                            (worker-count 4)
                            (concurrency 1)
                            (multiple-readers nil)
                            max-parallel-create-index
                            (truncate nil)
                            (disable-triggers nil)
                            (data-only nil)
                            (schema-only nil)
                            (create-schemas t)
                            (create-tables t)
                            (include-drop t)
                            (drop-schema nil)
                            (create-indexes t)
                            (index-names :uniquify)
                            (reset-sequences t)
                            (foreign-keys t)
                            (reindex nil)
                            (after-schema nil)
                            distribute
                            only-tables
                            including
                            excluding
                            set-table-oids
                            alter-table
                            alter-schema
                            materialize-views)
  "Export database source data and import it into PostgreSQL."
  (log-message :log "Migrating from ~a" (source-db copy))
  (log-message :log "Migrating into ~a" (target-db copy))
  (let* ((*on-error-stop* on-error-stop)
         (copy-data      (or data-only (not schema-only)))
         (create-ddl     (or schema-only (not data-only)))
         (create-tables  (and create-tables create-ddl))
         (create-schemas (and create-schemas create-ddl))
         ;; foreign keys have a special meaning in data-only mode
         (foreign-keys   (if (eq :redshift (pgconn-variant (target-db copy)))
                             nil
                             foreign-keys))
         (drop-indexes   (if (eq :redshift (pgconn-variant (target-db copy)))
                             nil
                             (or reindex
                                 (and include-drop create-ddl))))
         (create-indexes (if (eq :redshift (pgconn-variant (target-db copy)))
                             nil
                             (or reindex
                                 (and create-indexes drop-indexes create-ddl))))
         (reset-sequences (if (eq :redshift (pgconn-variant (target-db copy)))
                              nil
                              reset-sequences))

         (*preserve-index-names*
          (or (eq :preserve index-names)
              ;; if we didn't create the tables, we are re-installing the
              ;; pre-existing indexes
              (not create-tables)))

         (copy-kernel  (make-kernel worker-count))
         (copy-channel (let ((lp:*kernel* copy-kernel)) (lp:make-channel)))
         (catalog      (fetch-metadata
                        copy
                        (make-catalog
                         :name (typecase (source-db copy)
                                 (db-connection (db-name (source-db copy)))
                                 (fd-connection (pathname-name
                                                 (fd-path (source-db copy))))))
                        :materialize-views materialize-views
                        :create-indexes create-indexes
                        :foreign-keys foreign-keys
                        :only-tables only-tables
                        :including including
                        :excluding excluding))
         pkeys
         (writers-count (make-hash-table :size (count-tables catalog)))
         (max-indexes   (when create-indexes
                          (max-indexes-per-table catalog)))
         (idx-kernel    (when (and max-indexes (< 0 max-indexes))
                          (make-kernel (or max-parallel-create-index
                                           max-indexes))))
         (idx-channel   (when idx-kernel
                          (let ((lp:*kernel* idx-kernel))
                            (lp:make-channel))))
         (task-count 0))

    ;; apply catalog level transformations to support the database migration:
    ;; that's CAST rules, index WHERE clause rewriting and ALTER commands
    (process-catalog copy catalog
                     :alter-table alter-table
                     :alter-schema alter-schema
                     :distribute distribute)

    ;; if asked, first drop/create the tables on the PostgreSQL side
    (handler-case
        (progn
          (prepare-pgsql-database copy
                                  catalog
                                  :truncate truncate
                                  :create-tables create-tables
                                  :create-schemas create-schemas
                                  :drop-indexes drop-indexes
                                  :drop-schema drop-schema
                                  :include-drop include-drop
                                  :foreign-keys foreign-keys
                                  :set-table-oids set-table-oids
                                  :materialize-views materialize-views
                                  :distribute distribute)

          ;; if there's an AFTER SCHEMA DO/EXECUTE command, now is the time
          ;; to run it.
          (when after-schema
            (pgloader.parser::execute-sql-code-block (target-db copy)
                                                     :pre
                                                     after-schema
                                                     "after schema")))
      ;;
      ;; In case some error happens in the preparatory transaction, we
      ;; need to stop now and refrain from trying to load the data into
      ;; an incomplete schema.
      ;;
      (cl-postgres:database-error (e)
        (declare (ignore e))            ; a log has already been printed
        (log-message :fatal "Failed to create the schema, see above.")

        ;; we might have some cleanup to do...
        (cleanup copy catalog :materialize-views materialize-views)

        (return-from copy-database)))

    (loop
       :for table :in (append (table-list catalog)
                              ;; when materialized views are not supported,
                              ;; view-list is empty here
                              (view-list catalog))

       :do (let ((table-source (instanciate-table-copy-object copy table)))
             ;; first COPY the data from source to PostgreSQL, using copy-kernel
             (if (not copy-data)
                 ;; start indexing straight away then
                 (when create-indexes
                   (alexandria:appendf
                    pkeys
                    (create-indexes-in-kernel (target-db copy)
                                              table
                                              idx-kernel
                                              idx-channel)))

                 ;; prepare the writers-count hash-table: as we start
                 ;; copy-from, we have concurrency tasks writing.
                 (progn                 ; when copy-data
                   (setf (gethash table writers-count) concurrency)

                   (incf task-count
                         (copy-from table-source
                                    :concurrency concurrency
                                    :multiple-readers multiple-readers
                                    :kernel copy-kernel
                                    :channel copy-channel
                                    :on-error-stop on-error-stop
                                    :disable-triggers disable-triggers))))))

    ;; now end the kernels, and each time a table is done, launch its
    ;; indexing
    (when copy-data
      (let ((lp:*kernel* copy-kernel))
        (with-stats-collection ("COPY Threads Completion" :section :post
                                                          :use-result-as-read t
                                                          :use-result-as-rows t)
          (loop :repeat task-count
             :do (destructuring-bind (task table seconds)
                     (lp:receive-result copy-channel)
                   (log-message :debug
                                "Finished processing ~a for ~s ~50T~6$s"
                                task (format-table-name table) seconds)
                   (when (eq :writer task)
                     ;;
                     ;; Start the CREATE INDEX parallel tasks only when the
                     ;; data has been fully copied over to the corresponding
                     ;; table, that is when the writers count is down to
                     ;; zero.
                     ;;
                     (decf (gethash table writers-count))
                     (log-message :debug "writers-counts[~a] = ~a"
                                  (format-table-name table)
                                  (gethash table writers-count))

                     (when (and create-indexes
                                (zerop (gethash table writers-count)))
                       (log-message :notice "DONE copying ~a"
                                    (format-table-name table))
                       (alexandria:appendf
                        pkeys
                        (create-indexes-in-kernel (target-db copy)
                                                  table
                                                  idx-kernel
                                                  idx-channel)))))
             :finally (progn
                        (lp:end-kernel :wait nil)
                        (return worker-count))))))

    (log-message :info "Done with COPYing data, waiting for indexes")

    (when create-indexes
      (let ((lp:*kernel* idx-kernel))
        ;; wait until the indexes are done being built...
        ;; don't forget to account for that waiting time.
        (with-stats-collection ("Index Build Completion" :section :post
                                                         :use-result-as-read t
                                                         :use-result-as-rows t)
          (loop :for count :below (count-indexes catalog)
             :do (lp:receive-result idx-channel))
          (lp:end-kernel :wait t)
          (log-message :info "Done waiting for indexes")
          (count-indexes catalog))))

    ;;
    ;; Complete the PostgreSQL database before handing over.
    ;;
    (complete-pgsql-database copy
                             catalog
                             pkeys
                             :foreign-keys foreign-keys
                             :create-indexes create-indexes
                             ;; only create triggers (for default values)
                             ;; when we've been responsible for creating the
                             ;; tables -- otherwise assume the schema is
                             ;; good as it is
                             :create-triggers create-tables
                             :reset-sequences reset-sequences)

    ;;
    ;; Time to cleanup!
    ;;
    (cleanup copy catalog :materialize-views materialize-views)))
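
;; A minimal usage sketch, not part of the original file: COPY here is
;; assumed to come from one of pgloader's source-specific copy classes
;; (MySQL, SQLite, MS SQL, ...), and DISTRIBUTION-RULES is a hypothetical
;; list of distribute rules. Unspecified keywords keep their defaults from
;; the lambda list above.
;;
;; (copy-database copy
;;                :worker-count 8
;;                :concurrency 2
;;                :distribute distribution-rules)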