mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-05 10:56:10 +02:00
Begin working on importing from zip files with plenty of wild guessing...
This commit is contained in:
parent
6b7af735ae
commit
f3b6054432
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
||||
galaxya
|
||||
csv
|
||||
69
archive.lisp
Normal file
69
archive.lisp
Normal file
@ -0,0 +1,69 @@
|
||||
;;;
|
||||
;;; Tools to handle archive files, like ZIP of CSV files
|
||||
;;;
|
||||
|
||||
(in-package #:pgloader.archive)
|
||||
|
||||
;;;
;;; Exploration helper: list what is inside a ZIP archive.
;;;
;;; This used to be a bare top-level form reading a hard-coded absolute path
;;; on the author's machine, which ran (and failed) every time the file was
;;; loaded anywhere else. As a function it only runs when called.
;;;
(defun zip-entry-names (zip-filename)
  "Return the list of entry names found in the ZIP archive at ZIP-FILENAME,
   in the order in which zip:do-zipfile-entries visits them."
  (let (names)
    (zip:with-zipfile (z zip-filename)
      (zip:do-zipfile-entries (name x z)
        (push name names)))
    ;; PUSH accumulates in reverse, restore the visiting order
    (nreverse names)))
|
||||
|
||||
(defun import-csv-from-zip (zip-filename
                            &key
                              (create-tables nil)
                              (truncate t)
                              (null-as ""))
  "Parse the ZIP file found at ZIP-FILENAME and inspect the CSV files in it.

   Only entries named `*.csv` are considered, and the first line of each
   such file is expected to hold the names of the columns to import.

   For each CSV entry, the column separator is guessed from the header line
   as the most frequent candidate of pgloader.csv::*separators*, the column
   names are derived with camelCase-to-colname, and the findings are printed
   on standard output. When CREATE-TABLES is true a CREATE TABLE statement
   is printed too. Entries without a header line are skipped.

   TRUNCATE and NULL-AS are accepted but not used yet."
  (declare (ignore truncate null-as))
  (zip:with-zipfile (zip zip-filename)
    (zip:do-zipfile-entries (filename entry zip)
      (format t "file: ~a~%" filename)
      (when (string= "csv" (pathname-type filename))
        (flex:with-input-from-sequence
            (stream (zip:zipfile-entry-contents entry))
          (let* ((lf-format (flex:make-external-format :utf-8 :eol-style :lf))
                 (u-stream
                  (flex:make-flexi-stream stream :external-format lf-format))
                 ;; return NIL rather than signal END-OF-FILE on an empty entry
                 (raw-header (read-line u-stream nil nil))
                 ;; a trailing #\Return means the file uses CRLF line endings
                 (header
                  (string-right-trim (list #\Return) (or raw-header ""))))
            (if (zerop (length header))
                (format t "  no header line, skipping~%")
                (let* ((crlf-p (/= (length header) (length raw-header)))
                       ;; reconsider the format when the last char of the
                       ;; header was actually #\Return
                       (u-stream
                        (if crlf-p
                            (flex:make-flexi-stream
                             stream
                             :external-format
                             (flex:make-external-format :utf-8
                                                        :eol-style :crlf))
                            u-stream))
                       ;; NIL when the file only contains the header line
                       (first-data-line (read-line u-stream nil nil))
                       ;;
                       ;; to guess the separator from the header, find the
                       ;; most frequent separator candidate
                       (sep-counts
                        (loop
                           for sep in pgloader.csv::*separators*
                           collect (cons sep (count sep header))))
                       (separator
                        (car (first (sort sep-counts #'> :key #'cdr))))
                       ;;
                       ;; now get the column names
                       (col-names
                        (mapcar #'camelCase-to-colname
                                (split-sequence:split-sequence separator
                                                               header))))

                  (format t " separator: ~a~% columns: ~a~%" separator col-names)
                  (when create-tables
                    (format t "CREATE TABLE ~a (~{~a~^, ~});~%"
                            (pathname-name filename) col-names))
                  (format t "first line: ~a~%" first-data-line)

                  ;; then from the first line of data, guess the column
                  ;; data types

                  ;; now create the table schema and begin importing the
                  ;; data
                  ))))))))
|
||||
|
||||
24
package.lisp
24
package.lisp
@ -30,7 +30,8 @@
|
||||
#:pgtable-reject-logs
|
||||
#:report-pgtable-stats
|
||||
#:report-pgstate-stats
|
||||
#:slurp-file-into-string))
|
||||
#:slurp-file-into-string
|
||||
#:camelCase-to-colname))
|
||||
|
||||
(defpackage #:pgloader.transforms
|
||||
(:use #:cl))
|
||||
@ -71,6 +72,27 @@
|
||||
#:guess-csv-params
|
||||
#:guess-all-csv-params))
|
||||
|
||||
;;;
;;; pgloader.archive: import data found in archive files (ZIP of CSV files)
;;;
(defpackage #:pgloader.archive
  (:use #:cl #:pgloader.params #:pgloader.csv)
  (:import-from #:pgloader.utils
                ;; logging and report output
                #:log-message #:report-header #:report-table-name
                #:report-results #:report-footer
                ;; timing
                #:format-interval #:timing
                ;; pgstate bookkeeping
                #:make-pgstate #:pgstate-get-table #:pgstate-add-table
                #:pgstate-setf #:pgstate-incf #:pgstate-decf
                #:report-pgtable-stats #:report-pgstate-stats
                ;; column naming
                #:camelCase-to-colname)
  (:export #:import-csv-from-zip))
|
||||
|
||||
(defpackage #:pgloader.mysql
|
||||
(:use #:cl #:pgloader.params)
|
||||
(:import-from #:pgloader.utils
|
||||
|
||||
@ -16,6 +16,8 @@
|
||||
#:lparallel ; threads, workers, queues
|
||||
#:esrap ; parser generator
|
||||
#:alexandria ; utils
|
||||
#:zip ; support for zip archive files
|
||||
#:flexi-streams ; streams
|
||||
#:command-line-arguments ; for the main function
|
||||
#:uiop ; portability layer (quit, argv, etc)
|
||||
)
|
||||
@ -26,8 +28,9 @@
|
||||
;; those are one-package-per-file
|
||||
(:file "parser" :depends-on ("package" "params"))
|
||||
(:file "transforms")
|
||||
(:file "queue" :depends-on ("package")) ; package pgloader.queue
|
||||
(:file "csv" :depends-on ("package")) ; package pgloader.csv
|
||||
(:file "queue" :depends-on ("package")) ; pgloader.queue
|
||||
(:file "csv" :depends-on ("package")) ; pgloader.csv
|
||||
(:file "archive" :depends-on ("package")) ; pgloader.archive
|
||||
|
||||
;; package pgloader.pgsql
|
||||
(:file "pgsql" :depends-on ("package"
|
||||
|
||||
19
utils.lisp
19
utils.lisp
@ -215,3 +215,22 @@
|
||||
;; http://www.ymeme.com/slurping-a-file-common-lisp-83.html
|
||||
(setf (fill-pointer seq) (read-sequence seq stream))
|
||||
seq)))
|
||||
|
||||
;;;
|
||||
;;; Camel Case converter
|
||||
;;;
|
||||
(defun camelCase-to-colname (string)
  "Transform input STRING into a suitable column name and return it as a
   new string: every character is downcased, and an underscore is inserted
   where a new word begins.

     lahmanID    =>  lahman_id
     playerID    =>  player_id
     birthYear   =>  birth_year

   A new word begins at any character, other than the first, that is equal
   to its own upcased form while the previous character is not. Characters
   without a distinct uppercase form (digits, punctuation) count as
   uppercase here, so col1 becomes col_1. No underscore is added in front of
   a character that already is an underscore."
  (coerce
   (loop
      for first = t then nil
      for char across string
      for previous-upper-p = nil then char-upper-p
      ;; CHAR= rather than EQ: EQ is not guaranteed to be true of two
      ;; identical characters
      for char-upper-p = (char= char (char-upcase char))
      for new-word = (and (not first) char-upper-p (not previous-upper-p))
      when (and new-word (not (char= char #\_))) collect #\_
      collect (char-downcase char))
   'string))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user