mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-05 02:46:10 +02:00
Reduce memory allocation in format-vector-row.
This function is used on every bit of data we send down to PostgreSQL, so I have good hopes of reducing its memory allocation having an impact on loading times. In particular for sizeable data sets.
This commit is contained in:
parent
46d6f339df
commit
7f737a5f55
@ -12,6 +12,7 @@
|
||||
;;; minimize data copying in certain cases, and we want to avoid a coerce
|
||||
;;; call here.
|
||||
;;;
|
||||
|
||||
(defun format-vector-row (row
|
||||
&optional
|
||||
(transforms (make-list (length row)))
|
||||
@ -21,69 +22,92 @@
|
||||
See http://www.postgresql.org/docs/9.2/static/sql-copy.html#AEN66609 for
|
||||
details about the format, and format specs."
|
||||
(declare (type simple-array row))
|
||||
(let* ((bytes 0)
|
||||
(buffer (make-array 64 ; a cache line worth of data
|
||||
:element-type '(unsigned-byte 8)
|
||||
:adjustable t
|
||||
:fill-pointer 0)))
|
||||
(macrolet ((byte-out (byte)
|
||||
`(progn (vector-push-extend ,byte buffer)
|
||||
(incf bytes)))
|
||||
(esc-byte-out (byte)
|
||||
`(progn (vector-push-extend #. (char-code #\\) buffer)
|
||||
(vector-push-extend ,byte buffer)
|
||||
(incf bytes 2))))
|
||||
(loop
|
||||
:with nbcols := (length row)
|
||||
:for raw-col :across row
|
||||
:for i :from 1
|
||||
:for more? := (< i nbcols)
|
||||
:for fn :in transforms
|
||||
:for col := (if pre-formatted raw-col
|
||||
(if fn (funcall fn raw-col) raw-col))
|
||||
:do
|
||||
(if (or (null col)
|
||||
;; still accept postmodern :NULL in "preprocessed" data
|
||||
(eq :NULL col))
|
||||
(progn
|
||||
;; NULL is expected as \N, two chars
|
||||
(byte-out #. (char-code #\\))
|
||||
(byte-out #. (char-code #\N)))
|
||||
|
||||
(if pre-formatted
|
||||
(let* ((data
|
||||
(cl-postgres-trivial-utf-8:string-to-utf-8-bytes col))
|
||||
(len (length data))
|
||||
(newbuffer
|
||||
(make-array (+ bytes len)
|
||||
:element-type '(unsigned-byte 8)
|
||||
:adjustable t
|
||||
:fill-pointer t)))
|
||||
(replace newbuffer buffer :start1 0)
|
||||
(replace newbuffer data :start1 bytes)
|
||||
(setf buffer newbuffer)
|
||||
(incf bytes len))
|
||||
;; first prepare an array of transformed and properly encoded columns
|
||||
(let* ((nbcols (length row))
|
||||
(pgrow (make-array nbcols :element-type 'array)))
|
||||
(loop :for raw-col :across row
|
||||
:for i :from 0
|
||||
:for fn :in transforms
|
||||
:for col := (if pre-formatted raw-col
|
||||
(if fn (funcall fn raw-col) raw-col))
|
||||
:do (setf (aref pgrow i)
|
||||
(if (or (null col) (eq :NULL col))
|
||||
nil
|
||||
(cl-postgres-trivial-utf-8:string-to-utf-8-bytes col))))
|
||||
|
||||
(loop
|
||||
;; From PostgreSQL docs:
|
||||
;;
|
||||
;; In particular, the following characters must be preceded
|
||||
;; by a backslash if they appear as part of a column value:
|
||||
;; backslash itself, newline, carriage return, and the
|
||||
;; current delimiter character.
|
||||
:for byte
|
||||
:across (cl-postgres-trivial-utf-8:string-to-utf-8-bytes col)
|
||||
:do (case byte
|
||||
;; now that we have all the columns, make a simple array out of them
|
||||
(if pre-formatted
|
||||
;; pre-formatted data means we can return it as it already is
|
||||
(flet ((col-length (col)
|
||||
;; NULL is \N (2 bytes) in COPY format
|
||||
(if col (length col) 2)))
|
||||
(let* ((bytes (+ nbcols (reduce '+ (map 'list #'col-length pgrow))))
|
||||
(result (make-array bytes :element-type '(unsigned-byte 8))))
|
||||
(loop :for start := 0 :then (+ start len 1)
|
||||
:for col :across pgrow
|
||||
:for len := (if col (length col) 2)
|
||||
:do (progn
|
||||
(if col
|
||||
(replace result
|
||||
(the (simple-array (unsigned-byte 8) (*)) col)
|
||||
:start1 start)
|
||||
;; insert \N for a null value
|
||||
(setf (aref result start) #. (char-code #\\)
|
||||
(aref result (+ 1 start)) #. (char-code #\N)))
|
||||
|
||||
;; either column separator, Tab, or Newline (end of record)
|
||||
(setf (aref result (+ start len))
|
||||
(if (= bytes (+ start len 1))
|
||||
#. (char-code #\Newline)
|
||||
#. (char-code #\Tab)))))
|
||||
|
||||
;; return the result and how many bytes it represents
|
||||
(values result bytes)))
|
||||
|
||||
;; in the general case we need to take into account PostgreSQL
|
||||
;; escaping rules for the COPY format
|
||||
(flet ((escaped-length (string)
|
||||
(if (null string)
|
||||
;; NULL is \N (2 bytes) in COPY format
|
||||
2
|
||||
(loop :for byte :across string
|
||||
:sum (case byte
|
||||
(#. (char-code #\\) 2)
|
||||
(#. (char-code #\Newline) 2)
|
||||
(#. (char-code #\Return) 2)
|
||||
(#. (char-code #\Tab) 2)
|
||||
(#. (char-code #\Backspace) 2)
|
||||
(#. (char-code #\Page) 2)
|
||||
(t 1))))))
|
||||
(let* ((bytes (+ nbcols
|
||||
(reduce '+ (map 'list #'escaped-length pgrow))))
|
||||
(result (make-array bytes :element-type '(unsigned-byte 8)))
|
||||
(pos 0))
|
||||
(macrolet ((byte-out (byte)
|
||||
`(progn (setf (aref result pos) ,byte)
|
||||
(incf pos)))
|
||||
(esc-byte-out (byte)
|
||||
`(progn (setf (aref result pos) #. (char-code #\\))
|
||||
(setf (aref result (1+ pos)) ,byte)
|
||||
(incf pos 2))))
|
||||
(loop :for col :across pgrow :do
|
||||
(if (null col)
|
||||
(esc-byte-out #. (char-code #\N))
|
||||
(loop :for byte :across col :do
|
||||
(case byte
|
||||
(#. (char-code #\\) (esc-byte-out byte))
|
||||
(#. (char-code #\Space) (byte-out byte))
|
||||
(#. (char-code #\Newline) (esc-byte-out byte))
|
||||
(#. (char-code #\Return) (esc-byte-out byte))
|
||||
(#. (char-code #\Tab) (esc-byte-out byte))
|
||||
(#. (char-code #\Backspace) (esc-byte-out byte))
|
||||
(#. (char-code #\Page) (esc-byte-out byte))
|
||||
(t (byte-out byte))))))
|
||||
(t (byte-out byte)))))
|
||||
|
||||
:when more? :do (byte-out #. (char-code #\Tab))
|
||||
;; either column separator, Tab, or end-of-record with Newline
|
||||
(if (= bytes (+ 1 pos))
|
||||
(byte-out #. (char-code #\Newline))
|
||||
(byte-out #. (char-code #\Tab)))))
|
||||
|
||||
:finally (progn (byte-out #. (char-code #\Newline))
|
||||
(return (values buffer bytes)))))))
|
||||
;; return the result and how many bytes it represents
|
||||
(values result bytes))))))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user