Reduce memory allocation in format-vector-row.

This function is used on every bit of data we send down to PostgreSQL, so I
have good hopes of reducing its memory allocation having an impact on
loading times. In particular for sizeable data sets.
This commit is contained in:
Dimitri Fontaine 2017-06-27 15:31:49 +02:00
parent 46d6f339df
commit 7f737a5f55

View File

@ -12,6 +12,7 @@
;;; minimize data copying in certain cases, and we want to avoid a coerce
;;; call here.
;;;
(defun format-vector-row (row
&optional
(transforms (make-list (length row)))
@ -21,69 +22,92 @@
See http://www.postgresql.org/docs/9.2/static/sql-copy.html#AEN66609 for
details about the format, and format specs."
(declare (type simple-array row))
(let* ((bytes 0)
(buffer (make-array 64 ; a cache line worth of data
:element-type '(unsigned-byte 8)
:adjustable t
:fill-pointer 0)))
(macrolet ((byte-out (byte)
`(progn (vector-push-extend ,byte buffer)
(incf bytes)))
(esc-byte-out (byte)
`(progn (vector-push-extend #. (char-code #\\) buffer)
(vector-push-extend ,byte buffer)
(incf bytes 2))))
(loop
:with nbcols := (length row)
:for raw-col :across row
:for i :from 1
:for more? := (< i nbcols)
:for fn :in transforms
:for col := (if pre-formatted raw-col
(if fn (funcall fn raw-col) raw-col))
:do
(if (or (null col)
;; still accept postmodern :NULL in "preprocessed" data
(eq :NULL col))
(progn
;; NULL is expected as \N, two chars
(byte-out #. (char-code #\\))
(byte-out #. (char-code #\N)))
(if pre-formatted
(let* ((data
(cl-postgres-trivial-utf-8:string-to-utf-8-bytes col))
(len (length data))
(newbuffer
(make-array (+ bytes len)
:element-type '(unsigned-byte 8)
:adjustable t
:fill-pointer t)))
(replace newbuffer buffer :start1 0)
(replace newbuffer data :start1 bytes)
(setf buffer newbuffer)
(incf bytes len))
;; first prepare an array of transformed and properly encoded columns
(let* ((nbcols (length row))
(pgrow (make-array nbcols :element-type 'array)))
(loop :for raw-col :across row
:for i :from 0
:for fn :in transforms
:for col := (if pre-formatted raw-col
(if fn (funcall fn raw-col) raw-col))
:do (setf (aref pgrow i)
(if (or (null col) (eq :NULL col))
nil
(cl-postgres-trivial-utf-8:string-to-utf-8-bytes col))))
(loop
;; From PostgreSQL docs:
;;
;; In particular, the following characters must be preceded
;; by a backslash if they appear as part of a column value:
;; backslash itself, newline, carriage return, and the
;; current delimiter character.
:for byte
:across (cl-postgres-trivial-utf-8:string-to-utf-8-bytes col)
:do (case byte
;; now that we have all the columns, make a simple array out of them
(if pre-formatted
;; pre-formatted data means we can return it as it already is
(flet ((col-length (col)
;; NULL is \N (2 bytes) in COPY format
(if col (length col) 2)))
(let* ((bytes (+ nbcols (reduce '+ (map 'list #'col-length pgrow))))
(result (make-array bytes :element-type '(unsigned-byte 8))))
(loop :for start := 0 :then (+ start len 1)
:for col :across pgrow
:for len := (if col (length col) 2)
:do (progn
(if col
(replace result
(the (simple-array (unsigned-byte 8) (*)) col)
:start1 start)
;; insert \N for a null value
(setf (aref result start) #. (char-code #\\)
(aref result (+ 1 start)) #. (char-code #\N)))
;; either column separator, Tab, or Newline (end of record)
(setf (aref result (+ start len))
(if (= bytes (+ start len 1))
#. (char-code #\Newline)
#. (char-code #\Tab)))))
;; return the result and how many bytes it represents
(values result bytes)))
;; in the general case we need to take into account PostgreSQL
;; escaping rules for the COPY format
(flet ((escaped-length (string)
(if (null string)
;; NULL is \N (2 bytes) in COPY format
2
(loop :for byte :across string
:sum (case byte
(#. (char-code #\\) 2)
(#. (char-code #\Newline) 2)
(#. (char-code #\Return) 2)
(#. (char-code #\Tab) 2)
(#. (char-code #\Backspace) 2)
(#. (char-code #\Page) 2)
(t 1))))))
(let* ((bytes (+ nbcols
(reduce '+ (map 'list #'escaped-length pgrow))))
(result (make-array bytes :element-type '(unsigned-byte 8)))
(pos 0))
(macrolet ((byte-out (byte)
`(progn (setf (aref result pos) ,byte)
(incf pos)))
(esc-byte-out (byte)
`(progn (setf (aref result pos) #. (char-code #\\))
(setf (aref result (1+ pos)) ,byte)
(incf pos 2))))
(loop :for col :across pgrow :do
(if (null col)
(esc-byte-out #. (char-code #\N))
(loop :for byte :across col :do
(case byte
(#. (char-code #\\) (esc-byte-out byte))
(#. (char-code #\Space) (byte-out byte))
(#. (char-code #\Newline) (esc-byte-out byte))
(#. (char-code #\Return) (esc-byte-out byte))
(#. (char-code #\Tab) (esc-byte-out byte))
(#. (char-code #\Backspace) (esc-byte-out byte))
(#. (char-code #\Page) (esc-byte-out byte))
(t (byte-out byte))))))
(t (byte-out byte)))))
:when more? :do (byte-out #. (char-code #\Tab))
;; either column separator, Tab, or end-of-record with Newline
(if (= bytes (+ 1 pos))
(byte-out #. (char-code #\Newline))
(byte-out #. (char-code #\Tab)))))
:finally (progn (byte-out #. (char-code #\Newline))
(return (values buffer bytes)))))))
;; return the result and how many bytes it represents
(values result bytes))))))