Oops, forgot to add the new file charsets.lisp.

This commit is contained in:
Dimitri Fontaine 2014-04-26 18:55:43 +02:00
parent 35ca4927e9
commit 78a988eb47

262
src/charsets.lisp Normal file
View File

@ -0,0 +1,262 @@
(in-package #:pgloader.utils)
(defparameter *ccl-describe-character-encodings*
":CP936 [Aliases: :GBK :MS936 :WINDOWS-936]
An 8-bit, variable-length character encoding in which
character code points in the range #x00-#x80 can be encoded in a
single octet; characters with larger code values can be encoded
in 2 bytes.
Alias :gbk :ms936 :windows-936
:EUC-JP [Aliases: :EUCJP]
An 8-bit, variable-length character encoding in which
character code points in the range #x00-#x7f can be encoded in a
single octet; characters with larger code values can be encoded
in 2 to 3 bytes.
:GB2312 [Aliases: :GB2312-80 :GB2312-1980 :EUC-CN :EUCCN]
An 8-bit, variable-length character encoding in which
character code points in the range #x00-#x80 can be encoded in a
single octet; characters with larger code values can be encoded
in 2 bytes.
Alias :gb2312-80 :gb2312-1980 :euc-cn :euccn
:ISO-8859-1 [Aliases: :ISO-LATIN-1 :LATIN-1 NIL :ISO_8859-1 :LATIN1 :L1 :IBM819 :CP819 :CSISOLATIN1]
An 8-bit, fixed-width character encoding in which all character
codes map to their Unicode equivalents. Intended to support most
characters used in most Western European languages.
:ISO-8859-10 [Aliases: :ISO-LATIN-6 :LATIN-6 :ISO_8859-10 :LATIN6 :CSISOLATIN6 :ISO-IR-157]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in Nordic
alphabets.
:ISO-8859-11
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found the Thai
alphabet.
:ISO-8859-13
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in Baltic
alphabets.
:ISO-8859-14 [Aliases: :ISO-LATIN-8 :LATIN-8 :ISO_8859-14 :ISO-IR-199 :LATIN8 :L8 :ISO-CELTIC]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in Celtic
languages.
:ISO-8859-15 [Aliases: :ISO-LATIN-9 :LATIN-9 :ISO_8859-15 :LATIN9]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in Western
European languages (including the Euro sign and some other characters
missing from ISO-8859-1.
:ISO-8859-16 [Aliases: :ISO-LATIN-10 :LATIN-10 :ISO_8859-16 :LATIN10 :L1 :ISO-IR-226]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in Southeast
European languages.
:ISO-8859-2 [Aliases: :ISO-LATIN-2 :LATIN-2 :ISO_8859-2 :LATIN2 :L2 :CSISOLATIN2]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in most
languages used in Central/Eastern Europe.
:ISO-8859-3 [Aliases: :ISO-LATIN-3 :LATIN-3 :ISO_8859-3 :LATIN3 :L3 :CSISOLATIN3]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in most
languages used in Southern Europe.
:ISO-8859-4 [Aliases: :ISO-LATIN-4 :LATIN-4 :ISO_8859-4 :LATIN4 :L4 :CSISOLATIN4]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in most
languages used in Northern Europe.
:ISO-8859-5 [Aliases: :ISO_8859-5 :CYRILLIC :CSISOLATINCYRILLIC :ISO-IR-144]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in the
Cyrillic alphabet.
:ISO-8859-6 [Aliases: :ISO_8859-6 :ARABIC :CSISOLATINARABIC :ISO-IR-127]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in the
Arabic alphabet.
:ISO-8859-7 [Aliases: :ISO_8859-7 :GREEK :GREEK8 :CSISOLATINGREEK :ISO-IR-126 :ELOT_928 :ECMA-118]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in the
Greek alphabet.
:ISO-8859-8 [Aliases: :ISO_8859-8 :HEBREW :CSISOLATINHEBREW :ISO-IR-138]
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in the
Hebrew alphabet.
:ISO-8859-9 [Aliases: :ISO-LATIN-5 :LATIN-5 :ISO_8859-9 :LATIN5 :CSISOLATIN5 :ISO-IR-148]
An 8-bit, fixed-width character encoding in which codes #x00-#xcf
map to their Unicode equivalents and other codes map to other Unicode
character values. Intended to provide most characters found in the
Turkish alphabet.
:MACINTOSH [Aliases: :MACOS-ROMAN :MACOSROMAN :MAC-ROMAN :MACROMAN]
An 8-bit, fixed-width character encoding in which codes #x00-#x7f
map to their Unicode equivalents and other codes map to other Unicode
character values. Traditionally used on Classic MacOS to encode characters
used in western languages.
:UCS-2
A 16-bit, fixed-length encoding in which characters with
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit word.
The endianness of the encoded data is indicated by the endianness of a
byte-order-mark character (#u+feff) prepended to the data; in the
absence of such a character on input, the data is assumed to be in
big-endian order.
:UCS-2BE
A 16-bit, fixed-length encoding in which characters with
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
big-endian word. The encoded data is implicitly big-endian;
byte-order-mark characters are not interpreted on input or prepended
to output.
:UCS-2LE
A 16-bit, fixed-length encoding in which characters with
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
little-endian word. The encoded data is implicitly little-endian;
byte-order-mark characters are not interpreted on input or prepended
to output.
:US-ASCII [Aliases: :CSASCII :CP637 :IBM637 :US :ISO646-US :ASCII :ISO-IR-6]
A 7-bit, fixed-width character encoding in which all character
codes map to their Unicode equivalents.
:UTF-16
A 16-bit, variable-length encoding in which characters with
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
word and characters with larger codes can be encoded in a
pair of 16-bit words. The endianness of the encoded data is
indicated by the endianness of a byte-order-mark character (#u+feff)
prepended to the data; in the absence of such a character on input,
the data is assumed to be in big-endian order. Output is written
in native byte-order with a leading byte-order mark.
:UTF-16BE
A 16-bit, variable-length encoding in which characters with
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
big-endian word and characters with larger codes can be encoded in a
pair of 16-bit big-endian words. The endianness of the encoded data
is implicit in the encoding; byte-order-mark characters are not
interpreted on input or prepended to output.
:UTF-16LE
A 16-bit, variable-length encoding in which characters with
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
little-endian word and characters with larger codes can be encoded in
a pair of 16-bit little-endian words. The endianness of the encoded
data is implicit in the encoding; byte-order-mark characters are not
interpreted on input or prepended to output.
:UTF-32 [Aliases: :UCS-4]
A 32-bit, fixed-length encoding in which all Unicode characters
can be encoded in a single 32-bit word. The endianness of the encoded
data is indicated by the endianness of a byte-order-mark
character (#u+feff) prepended to the data; in the absence of such a
character on input, input data is assumed to be in big-endian order.
Output is written in native byte order with a leading byte-order
mark.
:UTF-32BE [Aliases: :UCS-4BE]
A 32-bit, fixed-length encoding in which all Unicode characters
encoded in a single 32-bit word. The encoded data is implicitly big-endian;
byte-order-mark characters are not interpreted on input or prepended
to output.
:UTF-32LE [Aliases: :UCS-4LE]
A 32-bit, fixed-length encoding in which all Unicode characters can
encoded in a single 32-bit word. The encoded data is implicitly
little-endian; byte-order-mark characters are not interpreted on input
or prepended to output.
:UTF-8 [Aliases: :MULE-UTF-8]
An 8-bit, variable-length character encoding in which characters
with CHAR-CODEs in the range #x00-#x7f can be encoded in a single
octet; characters with larger code values can be encoded in 2 to 4
bytes.
:WINDOWS-31J [Aliases: :CP932 :CSWINDOWS31J]
An 8-bit, variable-length character encoding in which
character code points in the range #x00-#x7f can be encoded in a
single octet; characters with larger code values can be encoded
in 2 bytes.
")
(defun parse-ccl-encodings-desc-first-line (line)
"Given a line with :ENCODING [Aliases: :X :Y] return a proper cons."
(cl-ppcre:register-groups-bind (name aliases)
(":([A-Z0-9-]+).*Aliases: (.*)[]]" line)
(cons name (mapcar (lambda (alias) (subseq alias 1))
(split-sequence:split-sequence #\Space aliases)))))
(defun parse-ccl-encodings-desc (&optional
(desc *ccl-describe-character-encodings*))
"Parse the output of the ccl:describe-character-encodings function."
(with-input-from-string (s desc)
(loop :for line := (read-line s nil nil)
:while line
:when (and line (< 0 (length line)) (char= #\: (aref line 0)))
:collect (parse-ccl-encodings-desc-first-line line))))
(defun list-encodings-and-aliases ()
"Return an alist of encoding names supported by the current
implementation, associated with a list of encoding name aliases for each
of them."
(let ((encoding-and-aliases
#+ccl
(parse-ccl-encoding-desc (with-output-to-string (*terminal-io*)
(ccl:describe-character-encodings)))
#+sbcl
(let ((result '()))
(maphash (lambda (name encoding)
(declare (ignore name))
(pushnew encoding result))
sb-impl::*external-formats*)
(mapcar (lambda (encoding)
(mapcar (function string-upcase)
(slot-value encoding 'sb-impl::names)))
result))))
(sort encoding-and-aliases #'string< :key #'car)))
(defun show-encodings ()
"List known encodings names and aliases from charsets::*lisp-encodings*."
(format *standard-output* "Name ~30TAliases~%")
(format *standard-output* "--------~30T--------------~%")
(loop
:with encodings := (list-encodings-and-aliases)
:for (name . aliases) :in encodings
:do (format *standard-output* "~a~30T~{~a~^, ~}~%" name aliases))
(terpri))
(defun make-external-format (name)
"Return an object suitable as an external format in the current
implementation."
(let ((encoding (intern name "KEYWORD")))
#+ccl
(ccl:make-external-format :character-encoding encoding)
#+sbcl
encoding))