pgloader/archive.lisp

;;;
;;; Tools to handle archive files, like ZIP of CSV files
;;;

(in-package #:pgloader.archive)

(let (l)
  (zip:with-zipfile
      (z "/Users/dim/dev/CL/pgloader/test/lahman2012-csv.zip")
    (zip:do-zipfile-entries (name x z) (push name l))) l)

(defun import-csv-from-zip (zip-filename
			    &key
			      (create-tables nil)
			      (truncate t)
			      (null-as ""))
  "Parse a ZIP file found at FILENAME and import all the CSV files found.

   We only try to import data from files named `*.zip`, and we consider that
   the first line of such files are containing the names of the columns to
   import."
  (declare (ignore truncate))
  (zip:with-zipfile (zip zip-filename)
    (zip:do-zipfile-entries (filename entry zip)
      (format t "file: ~a~%" filename)
      (when (string= "csv" (pathname-type filename))
	(flex:with-input-from-sequence
	    (stream (zip:zipfile-entry-contents entry))
	  (let* ((fmt       (flex:make-external-format :utf-8 :eol-style :lf))
		 (u-stream  (flex:make-flexi-stream stream :external-format fmt))
		 (header    (read-line u-stream))
		 ;; reconsider the format when the last char of the header
		 ;; is actually #\Return
		 (header
		  (if (char= #\Return (aref header (- (length header) 1)))
		      (progn
			(setf fmt
			      (flex:make-external-format :utf-8 :eol-style :crlf)
			      u-stream
			      (flex:make-flexi-stream stream :external-format fmt))
			(string-right-trim (list #\Return) header))
		      header))
		 (first-data-line (read-line u-stream))
		 ;;
		 ;; to guess the separator from the header, find the most
		 ;; frequent separator candidate
		 (sep-counts      (loop
				     for sep in pgloader.csv::*separators*
				     collect (cons sep (count sep header))))
		 (separator       (car
				   (first (sort sep-counts #'> :key #'cdr))))
		 ;;
		 ;; now get the column names
		 (col-names
		  (mapcar #'camelCase-to-colname
			  (split-sequence:split-sequence separator header))))

	    (format t "  separator: ~a~%    columns: ~a~%" separator col-names)
	    (when create-tables
	      (format t "CREATE TABLE ~a (~{~a~^, ~});~%"
		      (pathname-name filename) col-names))
	    (format t "first line: ~a~%" first-data-line)

	    ;; then from the first line of data, guess the column data types

	    ;; now create the table schema and begin importing the data

	    ))))))