diff --git a/pgloader.1.md b/pgloader.1.md index 57a495a..1bed108 100644 --- a/pgloader.1.md +++ b/pgloader.1.md @@ -816,7 +816,8 @@ Here's an example: -- INCLUDING ONLY TABLE NAMES MATCHING ~/film/, 'actor' -- EXCLUDING TABLE NAMES MATCHING ~ - + -- DECODING TABLE NAMES MATCHING ~/messed/, ~/encoding/ AS utf8 + BEFORE LOAD DO $$ create schema if not exists sakila; $$; @@ -1063,12 +1064,24 @@ The `database` command accepts the following clauses and options: - *EXCLUDING TABLE NAMES MATCHING* - Introduce a comma separated list of table names or *rugular expression* + Introduce a comma separated list of table names or *regular expression* used to exclude table names from the migration. This filter only applies to the result of the *INCLUDING* filter. EXCLUDING TABLE NAMES MATCHING ~ + - *DECODING TABLE NAMES MATCHING* + + Introduce a comma separated list of table names or *regular expressions* + used to force the encoding to use when processing data from MySQL. If + the data encoding known to you is different from MySQL's idea about it, + this is the option to use. + + DECODING TABLE NAMES MATCHING ~/messed/, ~/encoding/ AS utf8 + + You can use as many such rules as you need, all with possibly different + encodings. + - *BEFORE LOAD DO* You can run SQL queries against the database before loading the data diff --git a/src/parser.lisp b/src/parser.lisp index 5aa8ede..dadf284 100644 --- a/src/parser.lisp +++ b/src/parser.lisp @@ -93,6 +93,7 @@ (def-keyword-rule "log") (def-keyword-rule "level") (def-keyword-rule "encoding") + (def-keyword-rule "decoding") (def-keyword-rule "truncate") (def-keyword-rule "lines") (def-keyword-rule "fields") @@ -804,6 +805,20 @@ (declare (ignore e table n m)) filter-list))) + +;;; +;;; Per table encoding options, because MySQL is so bad at encoding... 
+;;; +(defrule decoding-table-as (and kw-decoding kw-table kw-names kw-matching + filter-list + kw-as encoding) + (:lambda (source) + (destructuring-bind (d table n m filter-list as encoding) source + (declare (ignore d table n m as)) + (cons encoding filter-list)))) + +(defrule decoding-tables-as (* decoding-table-as)) + ;;; LOAD DATABASE FROM mysql:// (defrule load-mysql-database (and database-source target @@ -813,12 +828,13 @@ (? materialize-views) (? including) (? excluding) + (? decoding-tables-as) (? before-load-do) (? after-load-do)) (:lambda (source) (destructuring-bind (my-db-uri pg-db-uri options gucs casts views - incl excl + incl excl decoding-as before after) source (destructuring-bind (&key ((:host myhost)) @@ -867,6 +883,7 @@ `(:only-tables ',(list table-name))) :including ',incl :excluding ',excl + :decoding-as ',decoding-as :materialize-views ',views :state-before state-before :state-after state-after diff --git a/src/sources/mysql.lisp b/src/sources/mysql.lisp index 690df21..cb96b27 100644 --- a/src/sources/mysql.lisp +++ b/src/sources/mysql.lisp @@ -4,7 +4,10 @@ (in-package :pgloader.mysql) -(defclass copy-mysql (copy) () +(defclass copy-mysql (copy) + ((encoding :accessor encoding ; allows forcing encoding + :initarg :encoding + :initform nil)) (:documentation "pgloader MySQL Data Source")) (defun cast-mysql-column-definition-to-pgsql (mysql-column) @@ -60,10 +63,14 @@ (defmethod map-rows ((mysql copy-mysql) &key process-row-fn) "Extract MySQL data and call PROCESS-ROW-FN function with a single argument (a list of column values) for each row." 
- (let ((dbname (source-db mysql)) (table-name (source mysql))) + (let ((dbname (source-db mysql)) + (table-name (source mysql)) + (qmynd:*mysql-encoding* (encoding mysql))) (with-mysql-connection (dbname) + (when qmynd:*mysql-encoding* + (log-message :notice "Force encoding to ~a for ~a" + qmynd:*mysql-encoding* table-name)) (let* ((cols (get-column-list dbname table-name)) (sql (format nil "SELECT ~{~a~^, ~} FROM `~a`;" cols table-name)) (row-fn @@ -269,6 +276,18 @@ :all-indexes all-indexes :view-columns view-columns))) +(defun apply-decoding-as-filters (table-name filters) + "Return a generalized boolean which is non-nil only if TABLE-NAME matches + one of the FILTERS." + (flet ((apply-filter (filter) + ;; we close over table-name here. + (typecase filter + (string (string-equal filter table-name)) + (list (destructuring-bind (type val) filter + (ecase type + (:regex (cl-ppcre:scan val table-name)))))))) + (some #'apply-filter filters))) + ;;; ;;; Work on all tables for given database ;;; @@ -289,6 +308,7 @@ only-tables including excluding + decoding-as materialize-views) "Export MySQL data and Import it into PostgreSQL" (let* ((summary (null *state*)) @@ -346,15 +366,25 @@ (loop for (table-name . columns) in (append all-columns view-columns) do - (let ((table-source - (make-instance 'copy-mysql - :source-db dbname - :target-db pg-dbname - :source table-name - :target (apply-identifier-case table-name - identifier-case) - :fields columns))) + (let* ((encoding + ;; force the data encoding when asked to + (when decoding-as + (loop :for (encoding . 
filters) :in decoding-as + :when (apply-decoding-as-filters table-name filters) + :return encoding))) + + (table-source + (make-instance 'copy-mysql + :source-db dbname + :target-db pg-dbname + :source table-name + :target (apply-identifier-case table-name + identifier-case) + :fields columns + :encoding encoding))) + (log-message :debug "TARGET: ~a" (target table-source)) + ;; first COPY the data from MySQL to PostgreSQL, using copy-kernel (unless schema-only (copy-from table-source :kernel copy-kernel :truncate truncate)) diff --git a/test/parse/hans.goeuro.load b/test/parse/hans.goeuro.load index 8c121ec..b8e6ab9 100644 --- a/test/parse/hans.goeuro.load +++ b/test/parse/hans.goeuro.load @@ -16,12 +16,15 @@ LOAD DATABASE column enumerate.foo using empty-string-to-null - -- INCLUDING ONLY TABLE NAMES MATCHING ~/encoding/; - MATERIALIZE VIEWS d as $$ select cast(d as date) as d, count(*) as n from plop where d > '2013-10-02' group by cast(d as date); - $$; + $$ + + -- INCLUDING ONLY TABLE NAMES MATCHING ~/encoding/ + + DECODING TABLE NAMES MATCHING ~/messed/, ~/encoding/ AS utf8; +