From ff78ebf048d43c1c953632eabd9c9941111a39c7 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Thu, 14 May 2015 21:08:19 +0200 Subject: [PATCH] Improve SQLite values parsing, fix #231. It turns out that SQLite3 data type handling is back to kick us wherever it hurts, this time by the driver deciding to return blob data (a vector of unsigned bytes) when we expect properly encoded text data. In the Wikipedia data test case used to reproduce the bug, we're lucky enough that the byte vectors actually map to properly encoded strings. Of course doing the proper thing costs some performance. I'd like to be able to decide if I should blame the SQLite driver or the whole product on this one. The per-value data type handling still is a disaster in my book, though, which means it's crucially important for pgloader to get it right and allow users to seamlessly migrate away from using such a system. --- src/sources/sqlite/sqlite-cast-rules.lisp | 5 --- src/sources/sqlite/sqlite.lisp | 45 ++++++++++++++++++++--- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/sources/sqlite/sqlite-cast-rules.lisp b/src/sources/sqlite/sqlite-cast-rules.lisp index fef19b8..d77adf6 100644 --- a/src/sources/sqlite/sqlite-cast-rules.lisp +++ b/src/sources/sqlite/sqlite-cast-rules.lisp @@ -84,11 +84,6 @@ ;; COPY protocol. (values column (or fn (lambda (val) (if val (format nil "~a" val) :null)))))) -(defmethod cast-to-bytea-p ((col coldef)) - "Returns a generalized boolean, non-nil when the column is casted to a - PostgreSQL bytea column." - (string= "bytea" (cast-sqlite-column-definition-to-pgsql col))) - (defmethod format-pgsql-column ((col coldef)) "Return a string representing the PostgreSQL column definition." 
(let* ((column-name (apply-identifier-case (coldef-name col))) diff --git a/src/sources/sqlite/sqlite.lisp b/src/sources/sqlite/sqlite.lisp index 6a2c333..5b42a4a 100644 --- a/src/sources/sqlite/sqlite.lisp +++ b/src/sources/sqlite/sqlite.lisp @@ -44,14 +44,44 @@ ;;; Map a function to each row extracted from SQLite ;;; +(defun sqlite-encoding (db) + "Return a BABEL suitable encoding for the SQLite db handle." + (let ((encoding-string (sqlite:execute-single db "pragma encoding;"))) + (cond ((string-equal encoding-string "UTF-8") :utf-8) + ((string-equal encoding-string "UTF-16") :utf-16) + ((string-equal encoding-string "UTF-16le") :utf-16le) + ((string-equal encoding-string "UTF-16be") :utf-16be)))) + +(declaim (inline parse-value)) + +(defun parse-value (value sqlite-type pgsql-type &key (encoding :utf-8)) + "Parse value given by SQLite to match what PostgreSQL is expecting. + In some cases SQLite will give text output for a blob column (it's + base64) and at times will output binary data for text (utf-8 byte + vector)." 
+ (cond ((and (string-equal "text" pgsql-type) + (eq :blob sqlite-type) + (not (stringp value))) + ;; we expected a properly encoded string and received bytes instead + (babel:octets-to-string value :encoding encoding)) + + ((and (string-equal "bytea" pgsql-type) + (stringp value)) + ;; we expected bytes and got a string instead, must be base64 encoded + (base64:base64-string-to-usb8-array value)) + + ;; default case, just use what's been given to us + (t value))) + (defmethod map-rows ((sqlite copy-sqlite) &key process-row-fn) "Extract SQLite data and call PROCESS-ROW-FN function with a single argument (a list of column values) for each row" (let ((sql (format nil "SELECT * FROM ~a" (source sqlite))) - (blobs-p - (coerce (mapcar #'cast-to-bytea-p (fields sqlite)) 'vector))) + (pgtypes (map 'vector #'cast-sqlite-column-definition-to-pgsql + (fields sqlite)))) (with-connection (*sqlite-db* (source-db sqlite)) - (let ((db (conn-handle *sqlite-db*))) + (let* ((db (conn-handle *sqlite-db*)) + (encoding (sqlite-encoding db))) (handler-case (loop with statement = (sqlite:prepare-statement db sql) @@ -62,9 +92,12 @@ for row = (let ((v (make-array len))) (loop :for x :below len :for raw := (sqlite:statement-column-value statement x) - :for val := (if (and (aref blobs-p x) (stringp raw)) - (base64:base64-string-to-usb8-array raw) - raw) + :for ptype := (aref pgtypes x) + :for stype := (sqlite-ffi:sqlite3-column-type + (sqlite::handle statement) + x) + :for val := (parse-value raw stype ptype + :encoding encoding) :do (setf (aref v x) val)) v) counting t into rows