Implement generic field to columns projection with support for user code.

And use that new facility in the Archive mode of operations, which is a
practical unit of testing and milestone to reach. We have enough code to
actually compile and try running the parser's output.
This commit is contained in:
Dimitri Fontaine 2013-09-25 00:01:46 +02:00
parent b4e530981c
commit 6e3767bb37
5 changed files with 265 additions and 112 deletions

View File

@ -13,7 +13,35 @@
(defun http-fetch-file (url &key (tmpdir *default-tmpdir*))
"Download a file from URL into TMPDIR."
(ensure-directories-exist tmpdir)
(drakma:http-request url :force-binary t))
(let ((archive-filename (make-pathname :directory (namestring tmpdir)
:name (pathname-name url)
:type (pathname-type url))))
(multiple-value-bind (http-stream
status-code
headers
uri
stream
should-close
status)
(drakma:http-request url :force-binary t :want-stream t)
;; TODO: check the status-code
(declare (ignore status-code uri stream status))
(let* ((source-stream (flexi-streams:flexi-stream-stream http-stream))
(content-length
(parse-integer (cdr (assoc :content-length headers)))))
(with-open-file (archive-stream archive-filename
:direction :output
:element-type '(unsigned-byte 8)
:if-exists :supersede
:if-does-not-exist :create)
(let ((seq (make-array content-length
:element-type '(unsigned-byte 8)
:fill-pointer t)))
(setf (fill-pointer seq) (read-sequence seq source-stream))
(write-sequence seq archive-stream)))
(when should-close (close source-stream))))
;; return the pathname where we just downloaded the file
archive-filename))
(defun expand-archive (archive-file &key (tmpdir *default-tmpdir*))
"Expand given ARCHIVE-FILE in TMPDIR/(pathname-name ARCHIVE-FILE). Return
@ -25,8 +53,20 @@
(fad:pathname-as-directory (merge-pathnames archive-name tmpdir))))
(ensure-directories-exist expand-directory)
(ecase archive-type
(:zip (zip:unzip archive-file expand-directory)))))
(:zip (zip:unzip archive-file expand-directory)))
;; return the pathname where we did expand the archive
expand-directory))
(defun get-matching-filenames (directory regex)
"Apply given REGEXP to the DIRECTORY contents and return the list of
matching files."
(let ((matches nil)
(start (length (namestring directory))))
(flet ((push-matches (pathname)
(when (cl-ppcre:scan regex (namestring pathname) :start start)
(push pathname matches))))
(fad:walk-directory directory #'push-matches))
matches))
;;;

141
csv.lisp
View File

@ -12,13 +12,67 @@
:name table-name
:type "csv"))
(defun get-absolute-pathname (pathname-or-regex &key (root *csv-path-root*))
"PATHNAME-OR-REGEX is expected to be either (:regexp expression)
or (:filename pathname). In the first case, this fonction check if the
pathname is absolute or relative and returns an absolute pathname given
current working directory of ROOT.
In the second case, walk the ROOT directory and return the first pathname
that matches the regex. TODO: consider signaling a condition when we have
more than one match."
(destructuring-bind (type part) pathname-or-regex
(ecase type
(:regex (first (pgloader.archive:get-matching-filenames root part)))
(:filename (if (fad:pathname-absolute-p part) part
(merge-pathnames part root))))))
;;;
;;; Project fields into columns
;;;
(defun project-fields (&key fields columns null-as)
"The simplest projection happens when both FIELDS and COLS are nil: in
this case the projection is an identity, we simply return what we got --
still with some basic processing.
Other forms of projections consist of forming columns with the result of
applying a transformation function. In that case a cols entry is a list
of '(colname type expression), the expression being the (already
compiled) function to use here."
(let ((null-as (if (eq null-as :blanks)
(lambda (col)
(every (lambda (char) (char= char #\Space)) col))
`(lambda (col)
(if (string= ,null-as col) nil col)))))
(if (or (null fields) (null columns))
`(lambda (row)
(mapcar ,null-as row))
(let* ((args
(mapcar
(lambda (field)
(pgloader.transforms:intern-symbol (car field))) fields))
(newrow
(loop for (name type fn) in columns
collect (or fn
(let ((binding
(pgloader.transforms:intern-symbol name)))
`(funcall ,null-as ,binding))))))
`(lambda (row)
(destructuring-bind (,@args) row
(list ,@newrow)))))))
;;;
;;; Read a file format in CSV format, and call given function on each line.
;;;
(defun map-rows (table-name filename
&key
process-row-fn
(skip-first-p nil)
fields
columns
(skip-lines nil)
(separator #\Tab)
(quote cl-csv:*quote*)
(escape cl-csv:*quote-escape*)
@ -41,16 +95,23 @@ Finally returns how many rows where read and processed."
(let* ((read 0)
(reformat-then-process
(lambda (row)
(incf read)
(let ((row-with-nils
(mapcar (lambda (x) (if (string= null-as x) nil x)) row)))
(funcall process-row-fn row-with-nils)))))
(compile nil
(lambda (row)
(incf read)
(let* ((processing (project-fields :fields fields
:columns columns
:null-as null-as))
(processed-row (funcall processing row)))
(funcall process-row-fn processed-row))))))
;; we handle skipping more than one line here, as cl-csv only knows
;; about skipping the first line
(when (and skip-lines (< 0 skip-lines))
(loop repeat skip-lines do (read-line input nil nil)))
(handler-case
(cl-csv:read-csv input
:row-fn reformat-then-process
:skip-first-p skip-first-p
:separator separator
:quote quote
:escape escape)
@ -63,7 +124,9 @@ Finally returns how many rows where read and processed."
(defun copy-to-queue (table-name filename dataq
&key
(skip-first-p nil)
fields
columns
skip-lines
(separator #\Tab)
(quote cl-csv:*quote*)
(escape cl-csv:*quote-escape*)
@ -71,39 +134,33 @@ Finally returns how many rows where read and processed."
"Copy data from CSV FILENAME into lprallel.queue DATAQ"
(let ((read
(pgloader.queue:map-push-queue dataq #'map-rows table-name filename
:skip-first-p skip-first-p
:fields fields
:columns columns
:skip-lines skip-lines
:separator separator
:quote quote
:escape escape
:null-as null-as)))
(pgstate-incf *state* table-name :read read)))
(defun copy-from-file (dbname table-name filename
(defun copy-from-file (dbname table-name filename-or-regex
&key
fields
columns
transforms
(truncate t)
(skip-first-p nil)
skip-lines
(separator #\Tab)
(quote cl-csv:*quote*)
(escape cl-csv:*quote-escape*)
(null-as "\\N"))
"Copy data from CSV file FILENAME into PostgreSQL DBNAME.TABLE-NAME"
(let* ((report-header (null *state*))
(*state* (or *state* (pgloader.utils:make-pgstate)))
(lp:*kernel*
(lp:make-kernel 2 :bindings
`((*pgconn-host* . ,*pgconn-host*)
(*pgconn-port* . ,*pgconn-port*)
(*pgconn-user* . ,*pgconn-user*)
(*pgconn-pass* . ,*pgconn-pass*)
(*pg-settings* . ',*pg-settings*)
(*myconn-host* . ,*myconn-host*)
(*myconn-port* . ,*myconn-port*)
(*myconn-user* . ,*myconn-user*)
(*myconn-pass* . ,*myconn-pass*)
(*state* . ,*state*))))
(channel (lp:make-channel))
(dataq (lq:make-queue :fixed-capacity 4096)))
(let* ((report-header (null *state*))
(*state* (or *state* (pgloader.utils:make-pgstate)))
(lp:*kernel* (make-kernel 2))
(channel (lp:make-channel))
(dataq (lq:make-queue :fixed-capacity 4096))
(filename (get-absolute-pathname filename-or-regex)))
;; statistics
(when report-header (report-header))
@ -112,23 +169,23 @@ Finally returns how many rows where read and processed."
(multiple-value-bind (res secs)
(timing
(lp:submit-task channel (lambda ()
;; this function update :read stats
(copy-to-queue table-name filename dataq
:skip-first-p skip-first-p
:separator separator
:quote quote
:escape escape
:null-as null-as)))
(lp:submit-task channel
;; this function update :read stats
#'copy-to-queue table-name filename dataq
:fields fields
:columns columns
:skip-lines skip-lines
:separator separator
:quote quote
:escape escape
:null-as null-as)
;; and start another task to push that data from the queue to PostgreSQL
(lp:submit-task
channel
(lambda ()
;; this function update :rows stats
(pgloader.pgsql:copy-from-queue dbname table-name dataq
:truncate truncate
:transforms transforms)))
(lp:submit-task channel
;; this function update :rows stats
#'pgloader.pgsql:copy-from-queue dbname table-name dataq
:truncate truncate
:transforms transforms)
;; now wait until both the tasks are over
(loop for tasks below 2 do (lp:receive-result channel))

View File

@ -37,14 +37,6 @@
#:camelCase-to-colname
#:make-kernel))
(defpackage #:pgloader.parser
(:use #:cl #:esrap #:pgloader.params)
(:import-from #:pgloader.pgsql
#:with-pgsql-transaction
#:pgsql-execute)
(:export #:parse-command
#:run-command))
(defpackage #:pgloader.queue
(:use #:cl)
(:export #:map-pop-queue
@ -65,6 +57,14 @@
#:get-date-columns
#:format-row))
(defpackage #:pgloader.parser
(:use #:cl #:esrap #:pgloader.params)
(:import-from #:pgloader.pgsql
#:with-pgsql-transaction
#:pgsql-execute)
(:export #:parse-command
#:run-command))
;;
;; Specific source handling
@ -97,6 +97,7 @@
(:export #:*default-tmpdir*
#:http-fetch-file
#:expand-archive
#:get-matching-filenames
#:import-csv-from-zip))
(defpackage #:pgloader.syslog

View File

@ -172,11 +172,63 @@ Here's a quick description of the format we're parsing here:
(def-keyword-rule "before")
(def-keyword-rule "finally")
(def-keyword-rule "and")
(def-keyword-rule "do"))
(def-keyword-rule "do")
(def-keyword-rule "filename")
(def-keyword-rule "matching"))
(defrule kw-auto-increment (and "auto_increment" (* (or #\Tab #\Space)))
(:constant :auto-increment))
;;;
;;; Regular Expression support, quoted as-you-like
;;;
(defun process-quoted-regex (pr)
"Helper function to process different kinds of quotes for regexps"
(destructuring-bind (open regex close) pr
(declare (ignore open close))
`(:regex ,(text regex))))
(defrule single-quoted-regex (and #\' (+ (not #\')) #\')
(:function process-quoted-regex))
(defrule double-quoted-regex (and #\" (+ (not #\")) #\")
(:function process-quoted-regex))
(defrule parens-quoted-regex (and #\( (+ (not #\))) #\))
(:function process-quoted-regex))
(defrule braces-quoted-regex (and #\{ (+ (not #\})) #\})
(:function process-quoted-regex))
(defrule chevron-quoted-regex (and #\< (+ (not #\>)) #\>)
(:function process-quoted-regex))
(defrule brackets-quoted-regex (and #\[ (+ (not #\])) #\])
(:function process-quoted-regex))
(defrule slash-quoted-regex (and #\/ (+ (not #\/)) #\/)
(:function process-quoted-regex))
(defrule pipe-quoted-regex (and #\| (+ (not #\|)) #\|)
(:function process-quoted-regex))
(defrule sharp-quoted-regex (and #\# (+ (not #\#)) #\#)
(:function process-quoted-regex))
(defrule quoted-regex (and "~" (or single-quoted-regex
double-quoted-regex
parens-quoted-regex
braces-quoted-regex
chevron-quoted-regex
brackets-quoted-regex
slash-quoted-regex
pipe-quoted-regex
sharp-quoted-regex))
(:lambda (qr)
(destructuring-bind (tilde regex) qr
(declare (ignore tilde))
regex)))
;;;
@ -860,11 +912,11 @@ Here's a quick description of the format we're parsing here:
(declare (ignore fields ))
sep)))
(defrule csv-option (and (or option-truncate
option-skip-header
option-fields-enclosed-by
option-fields-escaped-by
option-fields-terminated-by)))
(defrule csv-option (or option-truncate
option-skip-header
option-fields-enclosed-by
option-fields-escaped-by
option-fields-terminated-by))
(defrule another-csv-option (and #\, ignore-whitespace csv-option)
(:lambda (source)
@ -893,7 +945,7 @@ Here's a quick description of the format we're parsing here:
(cons :date-format date-format))))
(defrule option-null-if-blanks (and kw-null kw-if kw-blanks)
(:constant (cons :null-if-blanks t)))
(:constant (cons :null-as :blanks)))
(defrule csv-field-option (or option-terminated-by
option-date-format
@ -950,13 +1002,9 @@ Here's a quick description of the format we're parsing here:
(or (alphanumericp character)
(member character '(#\_ #\-))))
(defun intern-transforms-symbol (symbol-name)
(intern (string-upcase symbol-name)
(find-package "PGLOADER.TRANSFORMS")))
(defrule sexp-symbol (+ (symbol-character-p character))
(:lambda (schars)
(intern-transforms-symbol (text schars))))
(pgloader.transforms:intern-symbol (text schars))))
(defrule sexp-string-char (or (not-doublequote character) (and #\\ #\")))
@ -1020,14 +1068,23 @@ Here's a quick description of the format we're parsing here:
;;
;; The main command parsing
;;
(defrule csv-source (and kw-load kw-csv kw-from maybe-quoted-filename)
(defrule filename-matching (and kw-filename kw-matching quoted-regex)
(:lambda (fm)
(destructuring-bind (filename matching regex) fm
(declare (ignore filename matching))
regex)))
(defrule filename-or-regex (or filename-matching maybe-quoted-filename))
(defrule csv-source (and kw-load kw-csv kw-from filename-or-regex)
(:lambda (src)
(destructuring-bind (load csv from source) src
(declare (ignore load csv from))
;; source is (:filename #P"pathname/here")
(destructuring-bind (type uri) source
(ecase type
(:filename uri))))))
(:filename source)
(:regex source))))))
(defun list-symbols (expression &optional s)
"Return a list of the symbols used in EXPRESSION."
@ -1037,22 +1094,6 @@ Here's a quick description of the format we're parsing here:
finally (return (reverse s))))
(t s)))
(defun col-expr-to-lambda (fields cols)
"Transform expression into proper lambda definitions."
(loop
with args = (mapcar
(lambda (field) (intern-transforms-symbol (car field)))
fields)
for (name type expression) in cols
for fun = (when expression
(let ((args-not-used
(set-difference args (list-symbols expression))))
(compile nil `(lambda (,@args)
;; avoid warnings when compiling the command
(declare (ignore ,@args-not-used))
,expression))))
collect (list name type fun)))
(defrule load-csv-file (and csv-source (? csv-source-field-list)
target (? csv-target-column-list)
csv-options)
@ -1068,10 +1109,9 @@ Here's a quick description of the format we're parsing here:
(*pgconn-pass* ,password))
(pgloader.csv:copy-from-file ,dbname
,table-name
,source
',source
:fields ',fields
:cols ',(funcall #'col-expr-to-lambda
fields columns)
:columns ',columns
,@options)))))))
@ -1106,14 +1146,6 @@ Here's a quick description of the format we're parsing here:
(declare (ignore finally do))
quoted)))
(defrule archive-source (and kw-in kw-archive http-uri)
(:lambda (src)
(destructuring-bind (in archive source) src
(declare (ignore in archive))
(destructuring-bind (type url) source
(ecase type
(:http url))))))
(defrule archive-command (or load-csv-file
load-dbf-file))
@ -1128,20 +1160,37 @@ Here's a quick description of the format we're parsing here:
(destructuring-bind (col1 cols) source
(list* col1 cols))))
(defrule in-archive (and archive-source
before-load-do
archive-command-list
finally-do)
(defrule archive-source (and kw-load kw-from kw-archive http-uri)
(:lambda (src)
(destructuring-bind (load from archive source) src
(declare (ignore load from archive))
(destructuring-bind (type url) source
(ecase type
(:http url))))))
(defrule load-from-archive (and archive-source
target
before-load-do
archive-command-list
finally-do)
(:lambda (archive)
(destructuring-bind (source before commands finally) archive
`(lambda ()
(let* ((archive-file (pgloader.archive:http-fetch-file ,source))
(*csv-root-file* (pgloader.archive:expand-archive archive-file)))
(progn
(pgsql-execute ,before :client-min-messages :error)
,@(loop for command in commands
collect `(funcall ,command))
(pgsql-execute ,finally :client-min-messages :error)))))))
(destructuring-bind (source pg-db-uri before commands finally) archive
(destructuring-bind (&key host port user password dbname &allow-other-keys)
pg-db-uri
`(lambda ()
(let* ((archive-file (pgloader.archive:http-fetch-file ,source))
(*csv-path-root* (pgloader.archive:expand-archive archive-file))
(*pgconn-host* ,host)
(*pgconn-port* ,port)
(*pgconn-user* ,user)
(*pgconn-pass* ,password))
(progn
(with-pgsql-transaction (,dbname)
(pgsql-execute ,before :client-min-messages :error))
,@(loop for command in commands
collect `(funcall ,command))
(with-pgsql-transaction (,dbname)
(pgsql-execute ,finally :client-min-messages :error)))))))))
;;;
@ -1154,7 +1203,7 @@ Here's a quick description of the format we're parsing here:
(defrule end-of-command (and ignore-whitespace #\; ignore-whitespace)
(:constant :eoc))
(defrule command (and (or in-archive
(defrule command (and (or load-from-archive
load-csv-file
load-dbf-file
load-database
@ -1258,9 +1307,10 @@ LOAD FROM http:///tapoueh.org/db.t
fields terminated by '\t';
"))
(defun test-parsing-in-archive ()
(defun test-parsing-load-from-archive ()
(parse-command "
IN ARCHIVE http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip
LOAD FROM ARCHIVE http://pgsql.tapoueh.org/temp/foo.zip
INTO postgresql://dim@localhost:54393/dim
BEFORE LOAD DO
$$
@ -1284,7 +1334,7 @@ LOAD FROM http:///tapoueh.org/db.t
);
$$
LOAD CSV FROM '*/GeoLiteCity-Blocks.csv'
LOAD CSV FROM FILENAME MATCHING ~/GeoLiteCity-Blocks.csv/
(
startIpNum, endIpNum, locId
)
@ -1299,7 +1349,7 @@ LOAD FROM http:///tapoueh.org/db.t
fields escaped by '\"',
fields terminated by '\\t'
AND LOAD CSV FROM '*/GeoLiteCity-Location.csv'
AND LOAD CSV FROM FILENAME MATCHING ~/GeoLiteCity-Location.csv/
(
locId,country,region,city,postalCode,
latitude,longitude,metroCode,areaCode

View File

@ -6,10 +6,15 @@
;;; syntax for transformations.
(defpackage #:pgloader.transforms
(:use #:cl))
(:use #:cl)
(:export #:intern-symbol))
(in-package :pgloader.transforms)
(defun intern-symbol (symbol-name)
(intern (string-upcase symbol-name)
(find-package "PGLOADER.TRANSFORMS")))
(defun zero-dates-to-null (date-string)
"MySQL accepts '0000-00-00' as a date, we want :null instead."
(cond