Begin working on importing from zip files with plenty of wild guessing...

This commit is contained in:
Dimitri Fontaine 2013-08-19 23:38:58 +02:00
parent 6b7af735ae
commit f3b6054432
5 changed files with 117 additions and 3 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
galaxya
csv

69
archive.lisp Normal file
View File

@ -0,0 +1,69 @@
;;;
;;; Tools to handle archive files, like ZIP of CSV files
;;;
(in-package #:pgloader.archive)
;;; NOTE(review): top-level scratch form, evaluated every time this file is
;;; loaded.  It opens the ZIP archive at the hard-coded absolute path below,
;;; pushes each entry name onto the list L and returns L, so the names come
;;; out in the reverse of the iteration order.  The result is not stored
;;; anywhere.  Presumably left over from REPL exploration -- confirm whether
;;; it should stay, since loading fails where that path does not exist.
(let (l)
(zip:with-zipfile
(z "/Users/dim/dev/CL/pgloader/test/lahman2012-csv.zip")
(zip:do-zipfile-entries (name x z) (push name l))) l)
(defun import-csv-from-zip (zip-filename
                            &key
                              (create-tables nil)
                              (truncate t)
                              (null-as ""))
  "Open the ZIP archive found at ZIP-FILENAME and inspect every CSV file in it.

   Only entries whose file type is `csv` are considered, and the first line
   of each such entry is taken to hold the names of the columns to import.

   For each CSV entry this currently prints the guessed separator, the
   derived column names and the first line of data.  When CREATE-TABLES is
   true, a CREATE TABLE statement listing the column names is printed too.
   No data is loaded yet: TRUNCATE and NULL-AS are accepted but not used."
  (declare (ignore truncate null-as))
  (zip:with-zipfile (zip zip-filename)
    (zip:do-zipfile-entries (filename entry zip)
      (format t "file: ~a~%" filename)
      (when (string= "csv" (pathname-type filename))
        (flex:with-input-from-sequence
            (stream (zip:zipfile-entry-contents entry))
          (let* ((fmt      (flex:make-external-format :utf-8 :eol-style :lf))
                 (u-stream (flex:make-flexi-stream stream :external-format fmt))
                 ;; an empty entry has no header line at all: read it as ""
                 ;; rather than signaling END-OF-FILE
                 (header   (read-line u-stream nil ""))
                 ;; reconsider the format when the last char of the header
                 ;; is actually #\Return, meaning the file uses CRLF line
                 ;; endings; guard against an empty header before indexing
                 (header
                  (if (and (plusp (length header))
                           (char= #\Return
                                  (char header (1- (length header)))))
                      (progn
                        (setf fmt
                              (flex:make-external-format :utf-8
                                                         :eol-style :crlf)
                              u-stream
                              (flex:make-flexi-stream stream
                                                      :external-format fmt))
                        (string-right-trim (list #\Return) header))
                      header))
                 ;; NIL when the entry only contains a header line
                 (first-data-line (read-line u-stream nil nil))
                 ;;
                 ;; to guess the separator from the header, find the most
                 ;; frequent separator candidate
                 (sep-counts (loop
                                for sep in pgloader.csv::*separators*
                                collect (cons sep (count sep header))))
                 ;; SEP-COUNTS is a fresh list, so sorting it destructively
                 ;; is safe; STABLE-SORT keeps the candidates' original
                 ;; order on ties so that the guess is deterministic
                 (separator (car
                             (first (stable-sort sep-counts #'> :key #'cdr))))
                 ;;
                 ;; now get the column names
                 (col-names
                  (mapcar #'camelCase-to-colname
                          (split-sequence:split-sequence separator header))))
            (format t "  separator: ~a~%  columns: ~a~%" separator col-names)
            (when create-tables
              (format t "CREATE TABLE ~a (~{~a~^, ~});~%"
                      (pathname-name filename) col-names))
            (format t "first line: ~a~%" first-data-line)
            ;; then from the first line of data, guess the column data types
            ;; now create the table schema and begin importing the data
            ))))))

View File

@ -30,7 +30,8 @@
#:pgtable-reject-logs
#:report-pgtable-stats
#:report-pgstate-stats
#:slurp-file-into-string))
#:slurp-file-into-string
#:camelCase-to-colname))
;; The pgloader.transforms package only uses the COMMON-LISP package; this
;; form neither imports nor exports any symbol.
(defpackage #:pgloader.transforms
(:use #:cl))
@ -71,6 +72,27 @@
#:guess-csv-params
#:guess-all-csv-params))
;;
;; Package for the code importing CSV files out of ZIP archives.  Its only
;; exported symbol is IMPORT-CSV-FROM-ZIP.
;;
(defpackage #:pgloader.archive
(:use #:cl #:pgloader.params #:pgloader.csv)
;; symbols imported one by one from pgloader.utils
(:import-from #:pgloader.utils
#:log-message
#:report-header
#:report-table-name
#:report-results
#:report-footer
#:format-interval
#:timing
#:make-pgstate
#:pgstate-get-table
#:pgstate-add-table
#:pgstate-setf
#:pgstate-incf
#:pgstate-decf
#:report-pgtable-stats
#:report-pgstate-stats
#:camelCase-to-colname)
(:export #:import-csv-from-zip))
(defpackage #:pgloader.mysql
(:use #:cl #:pgloader.params)
(:import-from #:pgloader.utils

View File

@ -16,6 +16,8 @@
#:lparallel ; threads, workers, queues
#:esrap ; parser generator
#:alexandria ; utils
#:zip ; support for zip archive files
#:flexi-streams ; streams
#:command-line-arguments ; for the main function
#:uiop ; portability layer (quit, argv, etc)
)
@ -26,8 +28,9 @@
;; those are one-package-per-file
(:file "parser" :depends-on ("package" "params"))
(:file "transforms")
(:file "queue" :depends-on ("package")) ; package pgloader.queue
(:file "csv" :depends-on ("package")) ; package pgloader.csv
(:file "queue" :depends-on ("package")) ; pgloader.queue
(:file "csv" :depends-on ("package")) ; pgloader.csv
(:file "archive" :depends-on ("package")) ; pgloader.archive
;; package pgloader.pgsql
(:file "pgsql" :depends-on ("package"

View File

@ -215,3 +215,22 @@
;; http://www.ymeme.com/slurping-a-file-common-lisp-83.html
(setf (fill-pointer seq) (read-sequence seq stream))
seq)))
;;;
;;; Camel Case converter
;;;
(defun camelCase-to-colname (string)
  "Transform input STRING into a suitable column name.

    lahmanID        lahman_id
    playerID        player_id
    birthYear       birth_year

   The result is a fresh, all lower-case string.  An underscore is inserted
   before each character that starts a new word, that is a character equal
   to its own upper-case version that follows one that is not.  Characters
   without case, such as digits and underscores, are equal to their own
   upper-case version and so count as upper-case here.  No underscore is
   ever added in front of an existing underscore."
  (coerce
   (loop
      for first-p = t then nil
      for char across string
      ;; stepped before CHAR-UPPER-P, so this reads the previous char's value
      for previous-upper-p = nil then char-upper-p
      ;; compare with CHAR=: the result of EQ on characters is
      ;; implementation-dependent
      for char-upper-p = (char= char (char-upcase char))
      for new-word = (and (not first-p) char-upper-p (not previous-upper-p))
      when (and new-word (not (char= char #\_))) collect #\_
      collect (char-downcase char))
   'string))