From 3db3ecf81b0ebfb071a082fc383b182c160b656a Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 23 May 2018 13:43:28 +0200 Subject: [PATCH] Review Redshift data type dumb-down choices. It's a little more involved that what was done previously. In particular we need to pay attention to MySQL varchar(x) and transform them into something big enough when counting bytes rather than chars, like varchar(3x). Then there's the "text" datatype to take into account, and some more. --- src/pgsql/pgsql-finalize-catalogs.lisp | 120 +++++++++++++++++-------- 1 file changed, 84 insertions(+), 36 deletions(-) diff --git a/src/pgsql/pgsql-finalize-catalogs.lisp b/src/pgsql/pgsql-finalize-catalogs.lisp index bed7e40..5a4eeef 100644 --- a/src/pgsql/pgsql-finalize-catalogs.lisp +++ b/src/pgsql/pgsql-finalize-catalogs.lisp @@ -62,6 +62,7 @@ "TIMESTAMPTZ")) (defvar *redshift-decimal-max-precision* 38) +(defvar *redshift-varchar-max-precision* 65535) (defmethod adjust-data-types ((catalog catalog) (variant (eql :redshift))) (dumb-down-data-types-for-redshift catalog)) @@ -85,18 +86,68 @@ :do (dumb-down-data-types-for-redshift column))) (defmethod dumb-down-data-types-for-redshift ((column column)) - (cond ((and (stringp (column-type-name column)) - (member (column-type-name column) - *redshift-supported-data-types* - :test #'string-equal)) - ;; the target data type is supported... but we might have other - ;; situations to deal with, such as - ;; - ;; DECIMAL precision 65 must be between 1 and 38 - (when (and (member (column-type-name column) - '("decimal" "numeric") - :test #'string-equal) - (column-type-mod column)) + (let ((target-data-type "varchar") + (target-type-mod "(512)")) + (cond ( + ;; + ;; when the source column is an ENUM, we know from its source + ;; definition the maximum length of the target column + ;; + (and (typep (column-type-name column) 'sqltype) + (typep (sqltype-extra (column-type-name column)) 'list)) + + (let ((enum-value-list + (sqltype-extra (column-type-name column)))) + (setf target-data-type "varchar") + (setf target-type-mod + (format nil + "(~a)" + ;; times 3 because Redshift doesn't know + ;; how to count chars in unicode, it only + ;; knows how to count bytes + (* 3 + (reduce #'max + enum-value-list + :key #'length + :initial-value 0)))))) + + ;; + ;; When using "text" (unbounded varchar) in MySQL, set the + ;; precision of the varchar to the max of what Redshift can + ;; handle. + ;; + ((string-equal (column-type-name column) "text") + (setf target-data-type "varchar") + (setf target-type-mod + (format nil "(~a)" *redshift-varchar-max-precision*))) + + ;; + ;; when the source field is e.g. a MySQL varchar(12), we can keep + ;; the typemod here, or maybe 3 times the typemod to take into + ;; account Redshift encoding "properties". + ;; + ((string-equal (column-type-name column) "varchar") + (destructuring-bind (precision &optional (scale 0)) + (parse-column-typemod (column-type-name column) + (column-type-mod column)) + (declare (ignore scale)) + ;; Redshift can't count chars, it counts bytes, and we + ;; are dealing with utf-8. + (setf target-type-mod + (format nil "(~a)" + (min (* 3 precision) + *redshift-varchar-max-precision*))))) + + ;; + ;; Redshift has a special limit with decimal and numeric data + ;; types, so we must handle that specifically here: + ;; + ;; DECIMAL precision 65 must be between 1 and 38 + ;; + ((and (member (column-type-name column) + '("decimal" "numeric") + :test #'string-equal) + (column-type-mod column)) (destructuring-bind (precision &optional (scale 0)) (parse-column-typemod (column-type-name column) (column-type-mod column)) @@ -107,30 +158,27 @@ ;; ;; Internal pgloader API want the value as a string. ;; - (setf (column-type-mod column) + (setf target-data-type "numeric") + (setf target-type-mod (format nil "(~a,~a)" - *redshift-decimal-max-precision* scale)))))) + *redshift-decimal-max-precision* scale))))) - (t - (let ((target-data-type "varchar") - (target-type-mod "(512)")) - ;; - ;; TODO: we might want to be smarter about the data type conversion - ;; here, or at least a tad less dumb. For instance: - ;; - ;; - when the source column is an ENUM, we know from its source - ;; definition the maximum length of the target column - ;; - ;; - when the source field is e.g. a MySQL varchar(12), it might be - ;; - nice to keep the typemod here, or maybe 4 times the typemod to - ;; - take into account Redshift encoding "properties". - ;; - (setf (column-type-name column) target-data-type) - (setf (column-type-mod column) target-type-mod) + ;; + ;; Target data type is suppported, just keep it around. + ;; + ((and (stringp (column-type-name column)) + (member (column-type-name column) + *redshift-supported-data-types* + :test #'string-equal)) + (setf target-data-type (column-type-name column)) + (setf target-type-mod (column-type-mod column)))) - (log-message :info - "Redshift support: ~a change type from ~a to ~a~a" - (column-name column) - (column-type-name column) - target-data-type - target-type-mod))))) + (setf (column-type-name column) target-data-type) + (setf (column-type-mod column) target-type-mod) + + (log-message :info + "Redshift support: ~a change type from ~a to ~a~a" + (column-name column) + (column-type-name column) + target-data-type + target-type-mod)))