diff --git a/src/pgsql/copy-format.lisp b/src/pgsql/copy-format.lisp index 116b071..dbfa491 100644 --- a/src/pgsql/copy-format.lisp +++ b/src/pgsql/copy-format.lisp @@ -12,6 +12,7 @@ ;;; minimize data copying in certain cases, and we want to avoid a coerce ;;; call here. ;;; + (defun format-vector-row (row &optional (transforms (make-list (length row))) @@ -21,69 +22,92 @@ See http://www.postgresql.org/docs/9.2/static/sql-copy.html#AEN66609 for details about the format, and format specs." (declare (type simple-array row)) - (let* ((bytes 0) - (buffer (make-array 64 ; a cache line worth of data - :element-type '(unsigned-byte 8) - :adjustable t - :fill-pointer 0))) - (macrolet ((byte-out (byte) - `(progn (vector-push-extend ,byte buffer) - (incf bytes))) - (esc-byte-out (byte) - `(progn (vector-push-extend #. (char-code #\\) buffer) - (vector-push-extend ,byte buffer) - (incf bytes 2)))) - (loop - :with nbcols := (length row) - :for raw-col :across row - :for i :from 1 - :for more? := (< i nbcols) - :for fn :in transforms - :for col := (if pre-formatted raw-col - (if fn (funcall fn raw-col) raw-col)) - :do - (if (or (null col) - ;; still accept postmodern :NULL in "preprocessed" data - (eq :NULL col)) - (progn - ;; NULL is expected as \N, two chars - (byte-out #. (char-code #\\)) - (byte-out #. (char-code #\N))) - (if pre-formatted - (let* ((data - (cl-postgres-trivial-utf-8:string-to-utf-8-bytes col)) - (len (length data)) - (newbuffer - (make-array (+ bytes len) - :element-type '(unsigned-byte 8) - :adjustable t - :fill-pointer t))) - (replace newbuffer buffer :start1 0) - (replace newbuffer data :start1 bytes) - (setf buffer newbuffer) - (incf bytes len)) + ;; first prepare an array of transformed and properly encoded columns + (let* ((nbcols (length row)) + (pgrow (make-array nbcols :element-type 'array))) + (loop :for raw-col :across row + :for i :from 0 + :for fn :in transforms + :for col := (if pre-formatted raw-col + (if fn (funcall fn raw-col) raw-col)) + :do (setf (aref pgrow i) + (if (or (null col) (eq :NULL col)) + nil + (cl-postgres-trivial-utf-8:string-to-utf-8-bytes col)))) - (loop - ;; From PostgreSQL docs: - ;; - ;; In particular, the following characters must be preceded - ;; by a backslash if they appear as part of a column value: - ;; backslash itself, newline, carriage return, and the - ;; current delimiter character. - :for byte - :across (cl-postgres-trivial-utf-8:string-to-utf-8-bytes col) - :do (case byte + ;; now that we have all the columns, make a simple array out of them + (if pre-formatted + ;; pre-formatted data means we can return it as it already is + (flet ((col-length (col) + ;; NULL is \N (2 bytes) in COPY format + (if col (length col) 2))) + (let* ((bytes (+ nbcols (reduce '+ (map 'list #'col-length pgrow)))) + (result (make-array bytes :element-type '(unsigned-byte 8)))) + (loop :for start := 0 :then (+ start len 1) + :for col :across pgrow + :for len := (if col (length col) 2) + :do (progn + (if col + (replace result + (the (simple-array (unsigned-byte 8) (*)) col) + :start1 start) + ;; insert \N for a null value + (setf (aref result start) #. (char-code #\\) + (aref result (+ 1 start)) #. (char-code #\N))) + + ;; either column separator, Tab, or Newline (end of record) + (setf (aref result (+ start len)) + (if (= bytes (+ start len 1)) + #. (char-code #\Newline) + #. (char-code #\Tab))))) + + ;; return the result and how many bytes it represents + (values result bytes))) + + ;; in the general case we need to take into account PostgreSQL + ;; escaping rules for the COPY format + (flet ((escaped-length (string) + (if (null string) + ;; NULL is \N (2 bytes) in COPY format + 2 + (loop :for byte :across string + :sum (case byte + (#. (char-code #\\) 2) + (#. (char-code #\Newline) 2) + (#. (char-code #\Return) 2) + (#. (char-code #\Tab) 2) + (#. (char-code #\Backspace) 2) + (#. (char-code #\Page) 2) + (t 1)))))) + (let* ((bytes (+ nbcols + (reduce '+ (map 'list #'escaped-length pgrow)))) + (result (make-array bytes :element-type '(unsigned-byte 8))) + (pos 0)) + (macrolet ((byte-out (byte) + `(progn (setf (aref result pos) ,byte) + (incf pos))) + (esc-byte-out (byte) + `(progn (setf (aref result pos) #. (char-code #\\)) + (setf (aref result (1+ pos)) ,byte) + (incf pos 2)))) + (loop :for col :across pgrow :do + (if (null col) + (esc-byte-out #. (char-code #\N)) + (loop :for byte :across col :do + (case byte (#. (char-code #\\) (esc-byte-out byte)) - (#. (char-code #\Space) (byte-out byte)) (#. (char-code #\Newline) (esc-byte-out byte)) (#. (char-code #\Return) (esc-byte-out byte)) (#. (char-code #\Tab) (esc-byte-out byte)) (#. (char-code #\Backspace) (esc-byte-out byte)) (#. (char-code #\Page) (esc-byte-out byte)) - (t (byte-out byte)))))) + (t (byte-out byte))))) - :when more? :do (byte-out #. (char-code #\Tab)) + ;; either column separator, Tab, or end-of-record with Newline + (if (= bytes (+ 1 pos)) + (byte-out #. (char-code #\Newline)) + (byte-out #. (char-code #\Tab))))) - :finally (progn (byte-out #. (char-code #\Newline)) - (return (values buffer bytes))))))) + ;; return the result and how many bytes it represents + (values result bytes))))))