From f3b60544327007429daf6a4e82de3a52e1968aa1 Mon Sep 17 00:00:00 2001
From: Dimitri Fontaine
Date: Mon, 19 Aug 2013 23:38:58 +0200
Subject: [PATCH] Begin working on importing from zip files with plenty of wild guessing...

---
 .gitignore   |  1 +
 archive.lisp | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 package.lisp | 24 +++++++++++++++++-
 pgloader.asd |  7 ++++--
 utils.lisp   | 19 +++++++++++++++
 5 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 archive.lisp

diff --git a/.gitignore b/.gitignore
index c3f4517..ebe21a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
+galaxya
 csv
\ No newline at end of file
diff --git a/archive.lisp b/archive.lisp
new file mode 100644
index 0000000..3004658
--- /dev/null
+++ b/archive.lisp
@@ -0,0 +1,80 @@
+;;;
+;;; Tools to handle archive files, like ZIP of CSV files
+;;;
+
+(in-package #:pgloader.archive)
+
+;;; REPL example listing the entries of a ZIP archive; it is kept as a
+;;; comment so that loading this file does not open a developer-local path:
+;;;
+;;;   (let (l)
+;;;     (zip:with-zipfile
+;;;         (z "/Users/dim/dev/CL/pgloader/test/lahman2012-csv.zip")
+;;;       (zip:do-zipfile-entries (name x z) (push name l))) l)
+
+(defun import-csv-from-zip (zip-filename
+                            &key
+                              (create-tables nil)
+                              (truncate t)
+                              (null-as ""))
+  "Parse the ZIP file found at ZIP-FILENAME and look at the CSV files found.
+
+   We only consider entries named `*.csv`, and we consider that the first
+   line of such files contains the names of the columns to import.
+
+   The name of every entry is printed to standard output; for CSV entries
+   so are the guessed separator, the column names, the first line of data
+   and, when CREATE-TABLES is true, a CREATE TABLE statement. Nothing is
+   imported yet, and TRUNCATE and NULL-AS are accepted but ignored."
+  (declare (ignore truncate null-as))
+  (zip:with-zipfile (zip zip-filename)
+    (zip:do-zipfile-entries (filename entry zip)
+      (format t "file: ~a~%" filename)
+      (when (string= "csv" (pathname-type filename))
+        (flex:with-input-from-sequence
+            (stream (zip:zipfile-entry-contents entry))
+          (let* ((fmt (flex:make-external-format :utf-8 :eol-style :lf))
+                 (u-stream (flex:make-flexi-stream stream :external-format fmt))
+                 ;; an empty entry gives an empty header, not an END-OF-FILE
+                 (header (read-line u-stream nil ""))
+                 ;; reconsider the format when the last char of the header
+                 ;; is actually #\Return
+                 (header
+                  (if (and (plusp (length header))
+                           (char= #\Return (aref header (- (length header) 1))))
+                      (progn
+                        (setf fmt
+                              (flex:make-external-format :utf-8 :eol-style :crlf)
+                              u-stream
+                              (flex:make-flexi-stream stream :external-format fmt))
+                        (string-right-trim (list #\Return) header))
+                      header))
+                 ;; NIL when the entry has no line after its header
+                 (first-data-line (read-line u-stream nil nil))
+                 ;;
+                 ;; to guess the separator from the header, find the most
+                 ;; frequent separator candidate; STABLE-SORT keeps the order
+                 ;; of pgloader.csv::*separators* between equal counts
+                 (sep-counts (loop
+                                for sep in pgloader.csv::*separators*
+                                collect (cons sep (count sep header))))
+                 (separator (car
+                             (first (stable-sort sep-counts #'> :key #'cdr))))
+                 ;;
+                 ;; now get the column names
+                 (col-names
+                  (mapcar #'camelCase-to-colname
+                          (split-sequence:split-sequence separator header))))
+
+            (format t " separator: ~a~% columns: ~a~%" separator col-names)
+            (when create-tables
+              (format t "CREATE TABLE ~a (~{~a~^, ~});~%"
+                      (pathname-name filename) col-names))
+            (format t "first line: ~a~%" first-data-line)
+
+            ;; then from the first line of data, guess the column data types
+
+            ;; now create the table schema and begin importing the data
+
+            ))))))
+
diff --git a/package.lisp b/package.lisp
index 3527967..777f634 100644
--- a/package.lisp
+++ b/package.lisp
@@ -30,7 +30,8 @@
            #:pgtable-reject-logs
            #:report-pgtable-stats
            #:report-pgstate-stats
-           #:slurp-file-into-string))
+           #:slurp-file-into-string
+           #:camelCase-to-colname))
 
 (defpackage #:pgloader.transforms
   (:use #:cl))
@@ -71,6 +72,27 @@
            #:guess-csv-params
            #:guess-all-csv-params))
 
+(defpackage #:pgloader.archive
+  (:use #:cl #:pgloader.params #:pgloader.csv)
+  (:import-from #:pgloader.utils
+                #:log-message
+                #:report-header
+                #:report-table-name
+                #:report-results
+                #:report-footer
+                #:format-interval
+                #:timing
+                #:make-pgstate
+                #:pgstate-get-table
+                #:pgstate-add-table
+                #:pgstate-setf
+                #:pgstate-incf
+                #:pgstate-decf
+                #:report-pgtable-stats
+                #:report-pgstate-stats
+                #:camelCase-to-colname)
+  (:export #:import-csv-from-zip))
+
 (defpackage #:pgloader.mysql
   (:use #:cl #:pgloader.params)
   (:import-from #:pgloader.utils
diff --git a/pgloader.asd b/pgloader.asd
index b70d0ff..9ab417a 100644
--- a/pgloader.asd
+++ b/pgloader.asd
@@ -16,6 +16,8 @@
                  #:lparallel              ; threads, workers, queues
                  #:esrap                  ; parser generator
                  #:alexandria             ; utils
+                 #:zip                    ; support for zip archive files
+                 #:flexi-streams          ; streams
                  #:command-line-arguments ; for the main function
                  #:uiop                   ; portability layer (quit, argv, etc)
                  )
@@ -26,8 +28,9 @@
                ;; those are one-package-per-file
                (:file "parser" :depends-on ("package" "params"))
                (:file "transforms")
-               (:file "queue" :depends-on ("package")) ; package pgloader.queue
-               (:file "csv" :depends-on ("package")) ; package pgloader.csv
+               (:file "queue" :depends-on ("package")) ; pgloader.queue
+               (:file "csv" :depends-on ("package")) ; pgloader.csv
+               (:file "archive" :depends-on ("package" "csv")) ; pgloader.archive
 
                ;; package pgloader.pgsql
                (:file "pgsql" :depends-on ("package"
diff --git a/utils.lisp b/utils.lisp
index 4b1f004..0ae918d 100644
--- a/utils.lisp
+++ b/utils.lisp
@@ -215,3 +215,22 @@
       ;; http://www.ymeme.com/slurping-a-file-common-lisp-83.html
       (setf (fill-pointer seq) (read-sequence seq stream))
       seq)))
+
+;;;
+;;; Camel Case converter
+;;;
+(defun camelCase-to-colname (string)
+  "Return STRING downcased, with #\_ inserted before each character that
+   starts a run of capitals (caseless characters such as digits count too):
+     lahmanID => lahman_id   playerID => player_id   birthYear => birth_year"
+  (coerce
+   (loop
+      for first = t then nil
+      for char across string
+      for previous-upper-p = nil then char-upper-p
+      ;; CHAR=, not EQ: EQ is not guaranteed to compare characters by value
+      for char-upper-p = (char= char (char-upcase char))
+      for new-word = (and (not first) char-upper-p (not previous-upper-p))
+      when (and new-word (not (char= char #\_))) collect #\_
+      collect (char-downcase char))
+   'string))