Clean up the package definitions and their inter-dependencies, and implement a proper data structure for sharing stats about what's happening.

Dimitri Fontaine 2013-02-07 21:55:10 +01:00
parent 08f7791f66
commit f6bdf64c7c
10 changed files with 248 additions and 155 deletions
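For orientation, here is a minimal sketch of how the stats API introduced below (in utils.lisp, and exported through the new package.lisp) is meant to be driven by the loading code. It is illustrative only, not part of the commit; the database name, table name and counter values are made up.

;;; Illustrative sketch, not part of the commit: driving the new stats API.
;;; Database/table names and counter values are made up.
(defparameter *state* (pgloader.utils:make-pgstate)
  "Shared stats: global counters plus one pgtable entry per table.")

;; register a per-table entry (this also computes its reject file pathnames)
(pgloader.utils:pgstate-add-table *state* "mydb" "mytable")

;; bump counters as rows are streamed and errors are rejected
(pgloader.utils:pgstate-incf *state* "mytable" :rows 1000 :secs 1.5)
(pgloader.utils:pgstate-incf *state* "mytable" :errs 1)

;; per-table line, then the grand-total footer
(pgloader.utils:report-pgtable-stats *state* "mytable")
(pgloader.utils:report-pgstate-stats *state* "Total streaming time")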

View File

@@ -2,11 +2,6 @@
 ;;; Tools to handle MySQL data fetching
 ;;;
-(defpackage #:pgloader.csv
-  (:use #:cl)
-  (:export #:*csv-path-root*
-           #:get-pathname))
 (in-package :pgloader.csv)
 (defparameter *csv-path-root*

View File

@@ -2,22 +2,6 @@
 ;;; Tools to handle MySQL data fetching
 ;;;
-(defpackage #:pgloader.mysql
-  (:use #:cl)
-  (:import-from #:pgloader
-                #:*loader-kernel*
-                #:*myconn-host*
-                #:*myconn-user*
-                #:*myconn-pass*)
-  (:export #:map-rows
-           #:copy-from
-           #:list-databases
-           #:list-tables
-           #:export-all-tables
-           #:export-import-database
-           #:stream-mysql-table-in-pgsql
-           #:stream-database-tables))
 (in-package :pgloader.mysql)
 ;;;
@@ -121,7 +105,7 @@
   (let ((pgtables (pgloader.pgsql:list-tables dbname))
         (total-count 0)
         (total-seconds 0))
-    (pgloader.utils:report-header)
+    (report-header)
     (loop
        for table-name in (list-tables dbname
                                       :host host
@@ -131,18 +115,18 @@
        when (or (null only-tables)
                 (member table-name only-tables :test #'equal))
        do
-        (pgloader.utils:report-table-name table-name)
+        (report-table-name table-name)
        (multiple-value-bind (result seconds)
-           (pgloader.utils:timing
+           (timing
             (copy-from dbname table-name filename
                        :date-columns (pgloader.pgsql:get-date-columns
                                       table-name pgtables)))
          (when result
            (incf total-count result)
            (incf total-seconds seconds)
-           (pgloader.utils:report-results result seconds)))
+           (report-results result seconds)))
        finally
-        (pgloader.utils:report-footer "Total export time"
+        (report-footer "Total export time"
                        total-count total-seconds)
        (return (values total-count (float total-seconds))))))
@@ -160,17 +144,17 @@
   (let ((mysql-tables (list-tables dbname))
         (total-count 0)
         (total-seconds 0))
-    (pgloader.utils:report-header)
+    (report-header)
     (loop
        for (table-name . date-columns) in (pgloader.pgsql:list-tables dbname)
        when (or (null only-tables)
                 (member table-name only-tables :test #'equal))
        do
-        (pgloader.utils:report-table-name table-name)
+        (report-table-name table-name)
        (if (member table-name mysql-tables :test #'equal)
            (multiple-value-bind (result seconds)
-               (pgloader.utils:timing
+               (timing
                 (let ((filename
                        (pgloader.csv:get-pathname dbname table-name)))
                   ;; export from MySQL to file
@@ -182,11 +166,11 @@
             (when result
               (incf total-count result)
               (incf total-seconds seconds)
-              (pgloader.utils:report-results result seconds)))
+              (report-results result seconds)))
           ;; not a known mysql table
           (format t " skip, unknown table in MySQL database~%"))
        finally
-        (pgloader.utils:report-footer "Total export+import time"
+        (report-footer "Total export+import time"
                        total-count total-seconds)
        (return (values total-count (float total-seconds))))))
@@ -205,27 +189,21 @@
          (dataq (lq:make-queue 4096)))
     ;; have a task fill MySQL data in the queue
     (lp:submit-task
-     channel (lambda ()
-               (list :mysql
-                     (pgloader.queue:map-push-queue
-                      dataq
-                      #'map-rows dbname table-name))))
+     channel
+     (lambda ()
+       (pgloader.queue:map-push-queue
+        dataq #'map-rows dbname table-name)))
     ;; and start another task to push that data from the queue to PostgreSQL
     (lp:submit-task
     channel
     (lambda ()
-       (list :pgsql
-             (multiple-value-list
-              (pgloader.pgsql:copy-from-queue dbname table-name dataq
-                                              :truncate truncate
-                                              :date-columns date-columns)))))
+       (pgloader.pgsql:copy-from-queue dbname table-name dataq
+                                       :truncate truncate
+                                       :date-columns date-columns)))
     ;; now wait until both the tasks are over
-    (loop
-       for tasks below 2
-       collect (lp:receive-result channel) into counts
-       finally (return (cadr (assoc :pgsql counts)))))
+    (loop for tasks below 2 do (lp:receive-result channel))))
 ;;;
 ;;; Work on all tables for given database
@@ -233,36 +211,30 @@
 (defun stream-database-tables (dbname &key (truncate t) only-tables)
   "Export MySQL data and Import it into PostgreSQL"
   ;; get the list of tables and have at it
-  (let ((mysql-tables (list-tables dbname))
-        (total-count 0)
-        (total-errors 0)
-        (total-seconds 0))
-    (pgloader.utils:report-header)
+  (let ((mysql-tables (list-tables dbname)))
+    (report-header)
     (loop
        for (table-name . date-columns) in (pgloader.pgsql:list-tables dbname)
        when (or (null only-tables)
                 (member table-name only-tables :test #'equal))
        do
-        (pgloader.utils:report-table-name table-name)
+        (pgstate-add-table *state* dbname table-name)
+        (report-table-name table-name)
        (if (member table-name mysql-tables :test #'equal)
-           (multiple-value-bind (result seconds)
-               (pgloader.utils:timing
-                (destructuring-bind (rows errors)
-                    (stream-mysql-table-in-pgsql dbname table-name
-                                                 :truncate truncate
-                                                 :date-columns date-columns)
-                  (incf total-count rows)
-                  (incf total-errors errors)
-                  (list rows errors)))
-             ;; time to report
-             (destructuring-bind (rows errors) result
-               (incf total-seconds seconds)
-               (pgloader.utils:report-results rows errors seconds)))
+           (multiple-value-bind (res secs)
+               (timing
+                (stream-mysql-table-in-pgsql dbname table-name
+                                             :truncate truncate
+                                             :date-columns date-columns))
+             ;; set the timing we just measured
+             (declare (ignore res))
+             (pgstate-incf *state* table-name :secs secs)
+             (report-pgtable-stats *state* table-name))
           ;; not a known mysql table
          (format t "skip, unknown table in MySQL database~%"))
        finally
-        (pgloader.utils:report-footer "Total streaming time"
-                                      total-count total-errors total-seconds)
-        (return (values total-count (float total-seconds))))))
+        (report-pgstate-stats *state* "Total streaming time"))))

View File

@@ -1,7 +1,84 @@
 ;;;; package.lisp
+;;;
+;;; To avoid circular files dependencies, define all the packages here
+;;;
+(defpackage #:pgloader.utils
+  (:use #:cl)
+  (:export #:report-header
+           #:report-table-name
+           #:report-results
+           #:report-footer
+           #:format-interval
+           #:timing
+           #:make-pgstate
+           #:pgstate-get-table
+           #:pgstate-add-table
+           #:pgstate-setf
+           #:pgstate-incf
+           #:report-pgtable-stats
+           #:report-pgstate-stats))
+(defpackage #:pgloader.queue
+  (:use #:cl)
+  (:export #:map-pop-queue
+           #:map-push-queue))
+(defpackage #:pgloader.csv
+  (:use #:cl)
+  (:export #:*csv-path-root*
+           #:get-pathname))
+(defpackage #:pgloader.mysql
+  (:use #:cl)
+  (:import-from #:pgloader
+                #:*loader-kernel*
+                #:*myconn-host*
+                #:*myconn-user*
+                #:*myconn-pass*)
+  (:import-from #:pgloader.utils
+                #:report-header
+                #:report-table-name
+                #:report-results
+                #:report-footer
+                #:format-interval
+                #:timing
+                #:make-pgstate
+                #:pgstate-get-table
+                #:pgstate-add-table
+                #:pgstate-setf
+                #:pgstate-incf
+                #:report-pgtable-stats
+                #:report-pgstate-stats)
+  (:import-from #:pgloader
+                #:*state*)
+  (:export #:map-rows
+           #:copy-from
+           #:list-databases
+           #:list-tables
+           #:export-all-tables
+           #:export-import-database
+           #:stream-mysql-table-in-pgsql
+           #:stream-database-tables))
 (defpackage #:pgloader.pgsql
   (:use #:cl)
+  (:import-from #:pgloader.utils
+                #:report-header
+                #:report-table-name
+                #:report-results
+                #:report-footer
+                #:format-interval
+                #:timing
+                #:make-pgstate
+                #:pgstate-get-table
+                #:pgstate-add-table
+                #:pgstate-setf
+                #:pgstate-incf
+                #:report-pgtable-stats
+                #:report-pgstate-stats)
+  (:import-from #:pgloader
+                #:*state*)
   (:export #:truncate-table
            #:copy-from-file
            #:copy-from-queue
@@ -16,7 +93,8 @@
            #:copy-from-file
            #:list-databases
            #:list-tables)
-  (:export #:copy-from-file
+  (:export #:*state*
+           #:copy-from-file
            #:list-databases
            #:list-tables))
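With every defpackage centralized here, an individual source file only needs an in-package form; the unqualified calls in the other hunks of this commit rely on the :import-from clauses above. A purely illustrative example (the helper function below is hypothetical, not part of the commit):

;;; Illustrative only: a source file under the new layout just names its
;;; package; report-header is visible because package.lisp imports it
;;; from pgloader.utils into pgloader.mysql.
(in-package :pgloader.mysql)

(defun print-report-banner ()   ; hypothetical helper, not in the commit
  (report-header))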

View File

@@ -14,19 +14,19 @@
                #:lparallel)
  :components ((:file "package")
               (:file "utils" :depends-on ("package"))
-              (:file "pgloader" :depends-on ("package"))
+              (:file "pgloader" :depends-on ("package" "utils"))
               ;; those are one-package-per-file
-              (:file "queue")                         ; package pgloader.queue
-              (:file "csv")                           ; package pgloader.csv
+              (:file "queue" :depends-on ("package")) ; package pgloader.queue
+              (:file "csv" :depends-on ("package"))   ; package pgloader.csv
               ;; package pgloader.pgsql
-              (:file "pgsql" :depends-on ("queue" "utils"))
+              (:file "pgsql" :depends-on ("package" "queue" "utils"))
               ;; mysql.lisp depends on pgsql.lisp to be able to export data
               ;; from MySQL in the PostgreSQL format.
               ;;
               ;; package pgloader.mysql
-              (:file "mysql" :depends-on ("pgsql" "queue" "utils"))))
+              (:file "mysql" :depends-on ("package" "pgsql" "queue" "utils"))))

View File

@@ -15,6 +15,12 @@
 (defparameter *myconn-user* "myuser")
 (defparameter *myconn-pass* "mypass")
+(defparameter *state* (pgloader.utils:make-pgstate)
+  "pgloader state, global stats and per-table stats")
 ;;;
 ;;; TODO: define a top level API
 ;;;
+(defparameter *state* (pgloader.utils:make-pgstate)
+  "State of the current loading.")

View File

@@ -190,19 +190,25 @@ Finally returns how many rows where read and processed.
                            &key
                            (truncate t)
                            date-columns)
-  "Fetch data from the QUEUE until we see :end-of-data"
+  "Fetch data from the QUEUE until we see :end-of-data. Update *state*"
   (when truncate (truncate-table dbname table-name))
-  (let* ((conspec (remove :port (get-connection-string dbname)))
-         (total-errors 0))
+  (let* ((conspec (remove :port (get-connection-string dbname))))
     (loop
        for retval =
+         ;; The idea is to stream the queue content directly into the
+         ;; PostgreSQL COPY protocol stream, but COMMIT every
+         ;; *copy-batch-size* rows.
+         ;;
+         ;; That allows to have to recover from a buffer of data only rather
+         ;; than restart from scratch each time we have to find which row
+         ;; contains erroneous data. BATCH is that buffer.
          (let* ((stream (cl-postgres:open-db-writer conspec table-name nil))
                 (batch nil)
                 (batch-size 0)
                 (process-row-fn
                  ;; build our batch aware row processing function
-                 ;; it closes over batch and stream
+                 ;; it closes over stream, batch and batch-size
                  (lambda (row)
                    (let ((reformated-row
                           (reformat-row row :date-columns date-columns)))
@@ -221,14 +227,13 @@
              ((or
                CL-POSTGRES-ERROR:UNIQUE-VIOLATION
                CL-POSTGRES-ERROR:DATA-EXCEPTION) (condition)
-               (incf total-errors
-                     (retry-batch dbname table-name
-                                  (nreverse batch) batch-size))))))
-       ;; the final return value is the number of row processed
-       summing (if (consp retval) (cdr retval) retval) into total-rows
-       while (and (consp retval) (eq (car retval) :continue))
-       finally (return (values total-rows total-errors)))))
+               (retry-batch dbname table-name (nreverse batch) batch-size)))))
+       ;; fetch how many rows we just pushed through, update stats
+       for rows = (if (consp retval) (cdr retval) retval)
+       for cont = (and (consp retval) (eq (car retval) :continue))
+       do (pgstate-incf *state* table-name :rows rows)
+       while cont)))
 ;;;
 ;;; When a batch has been refused by PostgreSQL with a data-exception, that
@@ -250,12 +255,20 @@
 ;;;
 (defun process-bad-row (dbname table-name condition row)
   "Process bad row"
+  ;; first, the stats.
+  (pgstate-incf *state* table-name :errs 1)
+  ;; now, the bad row processing
   (let* ((str (format nil "~a" row))
          (str (if (< 72 (length str)) (subseq str 0 72)
                   str)))
     (format t "ERROR: ~a~%" condition)
     (format t "DATA: ~a...~%" str)))
+;;;
+;;; Compute the next batch size, must be smaller than the previous one or
+;;; just one row to ensure the retry-batch recursion is not infinite.
+;;;
 (defun smaller-batch-size (batch-size processed-rows)
   "How many rows should we process in next iteration?"
   (let ((remaining-rows (- batch-size processed-rows)))
@@ -265,9 +278,11 @@
     (min remaining-rows
          (floor (/ batch-size *copy-batch-split*))))))
+;;;
+;;; The recursive retry batch function.
+;;;
 (defun retry-batch (dbname table-name batch batch-size)
-  "Batch is a list of rows containing at least one error. Return number of
-   bad rows."
+  "Batch is a list of rows containing at least one bad row. Find it."
   (let* ((conspec (remove :port (get-connection-string dbname)))
          (current-batch-pos batch)
          (processed-rows 0)
@@ -282,15 +297,11 @@
                     (cl-postgres:open-db-writer conspec table-name nil)))
               (unwind-protect
-                  (progn
-                    (dotimes (i current-batch-size)
-                      ;; rows in that batch have already been processed
-                      (cl-postgres:db-write-row stream (car current-batch-pos))
-                      (setf current-batch-pos (cdr current-batch-pos))
-                      (incf processed-rows))
-                    ;; function's return value: number of bad rows extracted
-                    total-bad-rows)
+                  (dotimes (i current-batch-size)
+                    ;; rows in that batch have already been processed
+                    (cl-postgres:db-write-row stream (car current-batch-pos))
+                    (setf current-batch-pos (cdr current-batch-pos))
+                    (incf processed-rows))
                 (handler-case
                     (cl-postgres:close-db-writer stream)
@@ -301,11 +312,8 @@
                     CL-POSTGRES-ERROR:DATA-EXCEPTION) (condition)
                     ;; process bad data
                     (if (= 1 current-batch-size)
-                        (progn
-                          (process-bad-row
-                           dbname table-name condition (car current-batch))
-                          (incf total-bad-rows))
+                        (process-bad-row dbname table-name
+                                         condition (car current-batch))
                         ;; more than one line of bad data: recurse
                         (retry-batch dbname table-name
-                                     current-batch current-batch-size))))))
-       finally (return total-bad-rows))))
+                                     current-batch current-batch-size)))))))))
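With this change, error accounting moves out of return values: retry-batch narrows a failed batch down to the offending row, and process-bad-row bumps *state* directly via (pgstate-incf *state* table-name :errs 1). As a rough, illustrative sketch of how a failed batch shrinks, assuming *copy-batch-split* is 5 (its actual value is not shown in this diff):

;;; Illustrative only, not part of the commit: successively smaller batch
;;; sizes, each level retrying chunks of at most batch-size/split rows until
;;; a one-row batch isolates the bad row.
(defun sketch-retry-sizes (batch-size &optional (split 5))
  "List the successively smaller batch sizes a retry would walk through."
  (loop for size = batch-size then (max 1 (floor size split))
        collect size
        until (= size 1)))

;; (sketch-retry-sizes 25000) => (25000 5000 1000 200 40 8 1)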

View File

@@ -1,13 +1,6 @@
 ;;;
 ;;; Tools to handle internal queueing, using lparallel.queue
 ;;;
-(defpackage #:pgloader.queue
-  (:use #:cl)
-  (:export #:map-pop-queue
-           #:map-push-queue))
-
-;; no nickname for that package, queue is far too generic
 (in-package :pgloader.queue)
 (defun map-pop-queue (queue process-row-fn)

View File

@@ -6,3 +6,6 @@
 (setq *myconn-host* "localhost"
       *myconn-user* "debian-sys-maint"
       *myconn-pass* "vtmMI04yBZlFprYm")
+;; start with a new empty state, for stats.
+(setq pgloader:*state* (pgloader.utils:make-pgstate))

View File

@@ -1,37 +0,0 @@
-;;;
-;;; Some little timing tools
-;;;
-(in-package :pgloader)
-
-;;;
-;;; Timing Macros
-;;;
-(defun elapsed-time-since (start)
-  "Return how many seconds ticked between START and now"
-  (/ (- (get-internal-real-time) start)
-     internal-time-units-per-second))
-
-(defmacro timing (&body forms)
-  "return both how much real time was spend in body and its result"
-  (let ((start (gensym))
-        (end (gensym))
-        (result (gensym)))
-    `(let* ((,start (get-internal-real-time))
-            (,result (progn ,@forms))
-            (,end (get-internal-real-time)))
-       (values ,result (/ (- ,end ,start) internal-time-units-per-second)))))
-
-(defun format-interval (seconds &optional (stream t))
-  "Output the number of seconds in a human friendly way"
-  (multiple-value-bind (years months days hours mins secs millisecs)
-      (date:decode-interval (date:encode-interval :second seconds))
-    (format
-     stream
-     "~:[~*~;~d years ~]~:[~*~;~d months ~]~:[~*~;~d days ~]~:[~*~;~dh~]~:[~*~;~dm~]~d.~ds"
-     (< 0 years) years
-     (< 0 months) months
-     (< 0 days) days
-     (< 0 hours) hours
-     (< 0 mins) mins
-     secs (truncate millisecs))))

View File

@@ -1,18 +1,11 @@
 ;;;
 ;;; Random utilities
 ;;;
-(defpackage #:pgloader.utils
-  (:use #:cl)
-  (:export #:report-header
-           #:report-table-name
-           #:report-results
-           #:report-footer
-           #:format-interval
-           #:timing))
 (in-package :pgloader.utils)
+(defparameter *reject-path-root*
+  (make-pathname :directory "/tmp"))
 ;;;
 ;;; Timing Macro
 ;;;
@@ -48,6 +41,88 @@
      (< 0 mins) mins
      secs (truncate millisecs))))
+
+;;;
+;;; Data Structures to maintain information about loading state
+;;;
+(defstruct pgtable
+  name
+  (read 0 :type fixnum)     ; how many rows did we read
+  (rows 0 :type fixnum)     ; how many rows did we write
+  (errs 0 :type fixnum)     ; how many errors did we see
+  (secs 0.0 :type float)    ; how many seconds did it take
+  reject-data reject-logs)  ; files where to find reject data
+
+(defstruct pgstate
+  (tables (make-hash-table :test 'equal))
+  (read 0 :type fixnum)
+  (rows 0 :type fixnum)
+  (errs 0 :type fixnum)
+  (secs 0.0 :type float))
+
+(defun pgstate-get-table (pgstate name)
+  (gethash name (pgstate-tables pgstate)))
+
+(defun pgstate-add-table (pgstate dbname table-name)
+  "Instanciate a new pgtable structure to hold our stats, and return it."
+  (or (pgstate-get-table pgstate table-name)
+      (let ((table (setf (gethash table-name (pgstate-tables pgstate))
+                         (make-pgtable :name table-name))))
+        (setf (pgtable-reject-data table)
+              (make-pathname
+               :directory (pathname-directory
+                           (merge-pathnames
+                            (format nil "~a" dbname) *reject-path-root*))
+               :name table-name
+               :type "rej.dat")
+              (pgtable-reject-logs table)
+              (make-pathname
+               :directory (pathname-directory
+                           (merge-pathnames
+                            (format nil "~a" dbname) *reject-path-root*))
+               :name table-name
+               :type "rej.log"))
+        table)))
+
+(defun pgstate-setf (pgstate name &key read rows errs secs)
+  (let ((pgtable (pgstate-get-table pgstate name)))
+    (when read
+      (setf (pgtable-read pgtable) read)
+      (incf (pgstate-read pgstate) read))
+    (when rows
+      (setf (pgtable-rows pgtable) rows)
+      (incf (pgstate-rows pgstate) rows))
+    (when errs
+      (setf (pgtable-errs pgtable) errs)
+      (incf (pgstate-errs pgstate) errs))
+    (when secs
+      (setf (pgtable-secs pgtable) secs)
+      (incf (pgstate-secs pgstate) secs))
+    pgtable))
+
+(defun pgstate-incf (pgstate name &key rows errs secs)
+  (format t "~&pgstate-incf: ~d rows, ~d errs, ~f secs~%" rows errs secs)
+  (let ((pgtable (pgstate-get-table pgstate name)))
+    (when rows
+      (incf (pgtable-rows pgtable) rows)
+      (incf (pgstate-rows pgstate) rows))
+    (when errs
+      (incf (pgtable-errs pgtable) errs)
+      (incf (pgstate-errs pgstate) errs))
+    (when secs
+      (incf (pgtable-secs pgtable) secs)
+      (incf (pgstate-secs pgstate) secs))
+    pgtable))
+
+(defun report-pgtable-stats (pgstate name)
+  (with-slots (rows errs secs) (pgstate-get-table pgstate name)
+    (format t "~9@a ~9@a ~9@a" rows errs (format-interval secs nil))))
+
+(defun report-pgstate-stats (pgstate legend)
+  (with-slots (rows errs secs) pgstate
+    (format t "~&------------------------------ --------- --------- ---------")
+    (format t "~&~30@a ~9@a ~9@a ~9@a" legend
+            rows errs (format-interval secs nil))))
+
 ;;;
 ;;; Pretty print a report while doing bulk operations
 ;;;