From 718ac80560acd75e9cfc6839f1fff39803172db8 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Sat, 16 Nov 2013 21:35:17 +0100 Subject: [PATCH] Implement a Materialize Views option for MySQL. --- pgloader.1.md | 22 +++++++++++++- src/parser.lisp | 40 ++++++++++++++++++++++++- src/sources/mysql-schema.lisp | 56 +++++++++++++++++++++++++++++++++-- src/sources/mysql.lisp | 31 ++++++++++++++++--- test/parse/hans.goeuro.load | 10 ++++++- test/sakila.load | 2 ++ 6 files changed, 151 insertions(+), 10 deletions(-) diff --git a/pgloader.1.md b/pgloader.1.md index 97e2ca3..499bb18 100644 --- a/pgloader.1.md +++ b/pgloader.1.md @@ -720,7 +720,9 @@ Here's an example: type date drop not null drop default using zero-dates-to-null, -- type tinyint to boolean using tinyint-to-boolean, type year to integer - + + MATERIALIZE VIEWS film_list, staff_list + -- INCLUDING ONLY TABLE NAMES MATCHING ~/film/, 'actor' -- EXCLUDING TABLE NAMES MATCHING ~ @@ -882,6 +884,24 @@ The `database` command accepts the following clauses and options: be found un the `pgloader.transforms` Common Lisp package. See above for details. + - *MATERIALIZE VIEWS* + + This clause allows you to implement custom data processing at the data + source by providing a *view definition* against which pgloader will + query the data. It's not possible to just allow for plain `SQL` because + we want to know a lot about the exact data types of each column involved + in the query output. + + This clause expects a comma-separated list of view definitions, each one + being either the name of an existing view in your database or the + following expression: + + *name* `AS` `$$` *sql query* `$$` + + The *name* and the *sql query* will be used in a `CREATE VIEW` statement + at the beginning of the data loading, and the resulting view will then + be dropped at the end of the data loading. 
+ - *INCLUDING ONLY TABLE NAMES MATCHING* Introduce a comma separated list of table names or *regular expression* diff --git a/src/parser.lisp b/src/parser.lisp index f331694..f67a17b 100644 --- a/src/parser.lisp +++ b/src/parser.lisp @@ -106,6 +106,7 @@ (def-keyword-rule "header") (def-keyword-rule "null") (def-keyword-rule "if") + (def-keyword-rule "as") (def-keyword-rule "blanks") (def-keyword-rule "date") (def-keyword-rule "format") @@ -117,11 +118,13 @@ (def-keyword-rule "only") (def-keyword-rule "drop") (def-keyword-rule "create") + (def-keyword-rule "materialize") (def-keyword-rule "reset") (def-keyword-rule "table") (def-keyword-rule "name") (def-keyword-rule "names") (def-keyword-rule "tables") + (def-keyword-rule "views") (def-keyword-rule "indexes") (def-keyword-rule "sequences") (def-keyword-rule "foreign") @@ -674,6 +677,39 @@ (destructuring-bind (c casts) source (declare (ignore c)) casts))) + + +;;; +;;; Materialize views by copying their data over, allows for doing advanced +;;; ETL processing by having parts of the processing happen on the MySQL +;;; query side. +;;; +(defrule view-name (and (alpha-char-p character) + (* (or (alpha-char-p character) + (digit-char-p character) + #\_))) + (:text t)) + +(defrule view-sql (and kw-as dollar-quoted) + (:destructure (as sql) (declare (ignore as)) sql)) + +(defrule view-definition (and view-name (? 
view-sql)) + (:destructure (name sql) (cons name sql))) + +(defrule another-view-definition (and #\, ignore-whitespace view-definition) + (:lambda (source) + (destructuring-bind (comma ws view) source + (declare (ignore comma ws)) + view))) + +(defrule views-list (and view-definition (* another-view-definition)) + (:lambda (vlist) + (destructuring-bind (view1 views) vlist + (list* view1 views)))) + +(defrule materialize-views (and kw-materialize kw-views views-list) + (:destructure (mat views list) (declare (ignore mat views)) list)) + ;;; ;;; Including only some tables or excluding some others @@ -713,13 +749,14 @@ (? mysql-options) (? gucs) (? casts) + (? materialize-views) (? including) (? excluding) (? before-load-do) (? after-load-do)) (:lambda (source) (destructuring-bind (my-db-uri pg-db-uri options - gucs casts + gucs casts views incl excl before after) source @@ -767,6 +804,7 @@ `(:only-tables ',(list table-name))) :including ',incl :excluding ',excl + :materialize-views ',views :state-before state-before :state-after state-after :state-indexes state-idx diff --git a/src/sources/mysql-schema.lisp b/src/sources/mysql-schema.lisp index 00fa140..374ebb0 100644 --- a/src/sources/mysql-schema.lisp +++ b/src/sources/mysql-schema.lisp @@ -96,16 +96,66 @@ order by table_name" dbname only-tables)))) ;; free resources (cl-mysql:disconnect))) +(defun create-my-views (dbname views-alist + &key + (host *myconn-host*) + (user *myconn-user*) + (pass *myconn-pass*)) + "VIEWS-ALIST associates view names with their SQL definition, which might + be empty for already existing views. Create only the views for which we + have an SQL definition." + (let ((views (remove-if #'null views-alist :key #'cdr))) + (when views + (cl-mysql:connect :host host :user user :password pass) + (unwind-protect + (progn + (cl-mysql:use dbname) + (loop for (name . 
def) in views + for sql = (format nil "CREATE VIEW ~a AS ~a" name def) + do + (log-message :info "MySQL: ~a" sql) + (cl-mysql:query sql))) + ;; free resources + (cl-mysql:disconnect))))) + +(defun drop-my-views (dbname views-alist + &key + (host *myconn-host*) + (user *myconn-user*) + (pass *myconn-pass*)) + "See `create-my-views' for VIEWS-ALIST description. This time we DROP the + views to clean out after our work." + (let ((views (remove-if #'null views-alist :key #'cdr))) + (when views + (cl-mysql:connect :host host :user user :password pass) + (unwind-protect + (let ((sql + (format nil "DROP VIEW ~{~a~^, ~};" (mapcar #'car views)))) + (cl-mysql:use dbname) + (log-message :info "MySQL: ~a" sql) + (cl-mysql:query sql)) + ;; free resources + (cl-mysql:disconnect))))) + + ;;; ;;; Tools to get MySQL table and columns definitions and transform them to ;;; PostgreSQL CREATE TABLE statements, and run those. ;;; +(defvar *table-type* '((:table . "BASE TABLE") + (:view . "VIEW")) + "Associate internal table type symbol with what's found in MySQL + information_schema.tables.table_type column.") + (defun list-all-columns (dbname &key only-tables (host *myconn-host*) (user *myconn-user*) - (pass *myconn-pass*)) + (pass *myconn-pass*) + (table-type :table) + &aux + (table-type-name (cdr (assoc table-type *table-type*)))) "Get the list of MySQL column names per table." 
(cl-mysql:connect :host host :user user :password pass) @@ -121,9 +171,9 @@ order by table_name" dbname only-tables)))) c.is_nullable, c.extra from information_schema.columns c join information_schema.tables t using(table_schema, table_name) - where c.table_schema = '~a' and t.table_type = 'BASE TABLE' + where c.table_schema = '~a' and t.table_type = '~a' ~@[and table_name in (~{'~a'~^,~})~] -order by table_name, ordinal_position" dbname only-tables))) +order by table_name, ordinal_position" dbname table-type-name only-tables))) do (let ((entry (assoc table-name schema :test 'equal)) (column diff --git a/src/sources/mysql.lisp b/src/sources/mysql.lisp index 2fb9983..6c8d3d3 100644 --- a/src/sources/mysql.lisp +++ b/src/sources/mysql.lisp @@ -167,7 +167,8 @@ (identifier-case :downcase) ; or :quote only-tables including - excluding) + excluding + materialize-views) "Export MySQL data and Import it into PostgreSQL" (let* ((summary (null *state*)) (*state* (or *state* (make-pgstate))) @@ -177,6 +178,8 @@ (copy-kernel (make-kernel 2)) (dbname (source-db mysql)) (pg-dbname (target-db mysql)) + (view-names (mapcar #'car materialize-views)) + view-columns ; must wait until we created the views (all-columns (filter-column-list (list-all-columns dbname) :only-tables only-tables :including including @@ -225,20 +228,35 @@ ;; MySQL allows the same index name being used against several ;; tables, so we add the PostgreSQL table OID in the index name, ;; to differenciate. Set the table oids now. 
- (set-table-oids all-indexes)) + (set-table-oids all-indexes) + + ;; If asked to materialize views, now is the time to create + ;; the target tables for them + (when materialize-views + (create-my-views dbname materialize-views) + (setf view-columns (list-all-columns dbname + :only-tables view-names + :table-type :view)) + (create-tables view-columns + :identifier-case identifier-case + :include-drop include-drop))) ;; ;; In case some error happens in the preparatory transaction, we ;; need to stop now and refrain to try loading the data into an ;; incomplete schema. ;; + (cl-mysql-system:mysql-error (e) + (log-message :fatal "~a" e) + (return-from copy-database)) + (cl-postgres:database-error (e) (declare (ignore e)) ; a log has already been printed - (log-message :critical "Failed to create the schema, see above.") + (log-message :fatal "Failed to create the schema, see above.") (return-from copy-database))))) (loop - for (table-name . columns) in all-columns + for (table-name . columns) in (append all-columns view-columns) do (let ((table-source (make-instance 'copy-mysql @@ -278,6 +296,11 @@ (lp:end-kernel)) ;; + ;; If we created some views for this run, now is the time to DROP'em + ;; + (when materialize-views + (drop-my-views dbname materialize-views)) + ;; ;; Now Reset Sequences, the good time to do that is once the whole data ;; has been imported and once we have the indexes in place, as max() is ;; able to benefit from the indexes. 
In particular avoid doing that step diff --git a/test/parse/hans.goeuro.load b/test/parse/hans.goeuro.load index 488b421..557249d 100644 --- a/test/parse/hans.goeuro.load +++ b/test/parse/hans.goeuro.load @@ -8,4 +8,12 @@ LOAD DATABASE type date drop not null drop default using zero-dates-to-null, type tinyint to boolean using tinyint-to-boolean, type year to integer, - type timestamp to timestamptz drop not null using zero-dates-to-null; + type timestamp to timestamptz drop not null using zero-dates-to-null + + MATERIALIZE VIEWS + d as $$ + select cast(d as date) as d, count(*) as n + from plop + where d > '2013-10-02' + group by cast(d as date); + $$; diff --git a/test/sakila.load b/test/sakila.load index ffe9552..2fb0308 100644 --- a/test/sakila.load +++ b/test/sakila.load @@ -12,6 +12,8 @@ load database -- type tinyint to boolean using tinyint-to-boolean, type year to integer + MATERIALIZE VIEWS film_list, staff_list + -- INCLUDING ONLY TABLE NAMES MATCHING ~/film/, 'actor' -- EXCLUDING TABLE NAMES MATCHING ~