mirror of
https://github.com/dimitri/pgloader.git
synced 2026-05-04 18:36:12 +02:00
Oops, forgot to add the new file charsets.lisp.
This commit is contained in:
parent
35ca4927e9
commit
78a988eb47
262
src/charsets.lisp
Normal file
262
src/charsets.lisp
Normal file
@ -0,0 +1,262 @@
|
||||
(in-package #:pgloader.utils)
|
||||
|
||||
|
||||
(defparameter *ccl-describe-character-encodings*
|
||||
":CP936 [Aliases: :GBK :MS936 :WINDOWS-936]
|
||||
An 8-bit, variable-length character encoding in which
|
||||
character code points in the range #x00-#x80 can be encoded in a
|
||||
single octet; characters with larger code values can be encoded
|
||||
in 2 bytes.
|
||||
|
||||
Alias :gbk :ms936 :windows-936
|
||||
|
||||
:EUC-JP [Aliases: :EUCJP]
|
||||
An 8-bit, variable-length character encoding in which
|
||||
character code points in the range #x00-#x7f can be encoded in a
|
||||
single octet; characters with larger code values can be encoded
|
||||
in 2 to 3 bytes.
|
||||
|
||||
:GB2312 [Aliases: :GB2312-80 :GB2312-1980 :EUC-CN :EUCCN]
|
||||
An 8-bit, variable-length character encoding in which
|
||||
character code points in the range #x00-#x80 can be encoded in a
|
||||
single octet; characters with larger code values can be encoded
|
||||
in 2 bytes.
|
||||
|
||||
Alias :gb2312-80 :gb2312-1980 :euc-cn :euccn
|
||||
|
||||
:ISO-8859-1 [Aliases: :ISO-LATIN-1 :LATIN-1 NIL :ISO_8859-1 :LATIN1 :L1 :IBM819 :CP819 :CSISOLATIN1]
|
||||
An 8-bit, fixed-width character encoding in which all character
|
||||
codes map to their Unicode equivalents. Intended to support most
|
||||
characters used in most Western European languages.
|
||||
|
||||
:ISO-8859-10 [Aliases: :ISO-LATIN-6 :LATIN-6 :ISO_8859-10 :LATIN6 :CSISOLATIN6 :ISO-IR-157]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in Nordic
|
||||
alphabets.
|
||||
|
||||
:ISO-8859-11
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found the Thai
|
||||
alphabet.
|
||||
|
||||
:ISO-8859-13
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in Baltic
|
||||
alphabets.
|
||||
|
||||
:ISO-8859-14 [Aliases: :ISO-LATIN-8 :LATIN-8 :ISO_8859-14 :ISO-IR-199 :LATIN8 :L8 :ISO-CELTIC]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in Celtic
|
||||
languages.
|
||||
|
||||
:ISO-8859-15 [Aliases: :ISO-LATIN-9 :LATIN-9 :ISO_8859-15 :LATIN9]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in Western
|
||||
European languages (including the Euro sign and some other characters
|
||||
missing from ISO-8859-1.
|
||||
|
||||
:ISO-8859-16 [Aliases: :ISO-LATIN-10 :LATIN-10 :ISO_8859-16 :LATIN10 :L1 :ISO-IR-226]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in Southeast
|
||||
European languages.
|
||||
|
||||
:ISO-8859-2 [Aliases: :ISO-LATIN-2 :LATIN-2 :ISO_8859-2 :LATIN2 :L2 :CSISOLATIN2]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in most
|
||||
languages used in Central/Eastern Europe.
|
||||
|
||||
:ISO-8859-3 [Aliases: :ISO-LATIN-3 :LATIN-3 :ISO_8859-3 :LATIN3 :L3 :CSISOLATIN3]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in most
|
||||
languages used in Southern Europe.
|
||||
|
||||
:ISO-8859-4 [Aliases: :ISO-LATIN-4 :LATIN-4 :ISO_8859-4 :LATIN4 :L4 :CSISOLATIN4]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in most
|
||||
languages used in Northern Europe.
|
||||
|
||||
:ISO-8859-5 [Aliases: :ISO_8859-5 :CYRILLIC :CSISOLATINCYRILLIC :ISO-IR-144]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in the
|
||||
Cyrillic alphabet.
|
||||
|
||||
:ISO-8859-6 [Aliases: :ISO_8859-6 :ARABIC :CSISOLATINARABIC :ISO-IR-127]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in the
|
||||
Arabic alphabet.
|
||||
|
||||
:ISO-8859-7 [Aliases: :ISO_8859-7 :GREEK :GREEK8 :CSISOLATINGREEK :ISO-IR-126 :ELOT_928 :ECMA-118]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in the
|
||||
Greek alphabet.
|
||||
|
||||
:ISO-8859-8 [Aliases: :ISO_8859-8 :HEBREW :CSISOLATINHEBREW :ISO-IR-138]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x9f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in the
|
||||
Hebrew alphabet.
|
||||
|
||||
:ISO-8859-9 [Aliases: :ISO-LATIN-5 :LATIN-5 :ISO_8859-9 :LATIN5 :CSISOLATIN5 :ISO-IR-148]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#xcf
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Intended to provide most characters found in the
|
||||
Turkish alphabet.
|
||||
|
||||
:MACINTOSH [Aliases: :MACOS-ROMAN :MACOSROMAN :MAC-ROMAN :MACROMAN]
|
||||
An 8-bit, fixed-width character encoding in which codes #x00-#x7f
|
||||
map to their Unicode equivalents and other codes map to other Unicode
|
||||
character values. Traditionally used on Classic MacOS to encode characters
|
||||
used in western languages.
|
||||
|
||||
:UCS-2
|
||||
A 16-bit, fixed-length encoding in which characters with
|
||||
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit word.
|
||||
The endianness of the encoded data is indicated by the endianness of a
|
||||
byte-order-mark character (#u+feff) prepended to the data; in the
|
||||
absence of such a character on input, the data is assumed to be in
|
||||
big-endian order.
|
||||
|
||||
:UCS-2BE
|
||||
A 16-bit, fixed-length encoding in which characters with
|
||||
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
|
||||
big-endian word. The encoded data is implicitly big-endian;
|
||||
byte-order-mark characters are not interpreted on input or prepended
|
||||
to output.
|
||||
|
||||
:UCS-2LE
|
||||
A 16-bit, fixed-length encoding in which characters with
|
||||
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
|
||||
little-endian word. The encoded data is implicitly little-endian;
|
||||
byte-order-mark characters are not interpreted on input or prepended
|
||||
to output.
|
||||
|
||||
:US-ASCII [Aliases: :CSASCII :CP637 :IBM637 :US :ISO646-US :ASCII :ISO-IR-6]
|
||||
A 7-bit, fixed-width character encoding in which all character
|
||||
codes map to their Unicode equivalents.
|
||||
|
||||
:UTF-16
|
||||
A 16-bit, variable-length encoding in which characters with
|
||||
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
|
||||
word and characters with larger codes can be encoded in a
|
||||
pair of 16-bit words. The endianness of the encoded data is
|
||||
indicated by the endianness of a byte-order-mark character (#u+feff)
|
||||
prepended to the data; in the absence of such a character on input,
|
||||
the data is assumed to be in big-endian order. Output is written
|
||||
in native byte-order with a leading byte-order mark.
|
||||
|
||||
:UTF-16BE
|
||||
A 16-bit, variable-length encoding in which characters with
|
||||
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
|
||||
big-endian word and characters with larger codes can be encoded in a
|
||||
pair of 16-bit big-endian words. The endianness of the encoded data
|
||||
is implicit in the encoding; byte-order-mark characters are not
|
||||
interpreted on input or prepended to output.
|
||||
|
||||
:UTF-16LE
|
||||
A 16-bit, variable-length encoding in which characters with
|
||||
CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
|
||||
little-endian word and characters with larger codes can be encoded in
|
||||
a pair of 16-bit little-endian words. The endianness of the encoded
|
||||
data is implicit in the encoding; byte-order-mark characters are not
|
||||
interpreted on input or prepended to output.
|
||||
|
||||
:UTF-32 [Aliases: :UCS-4]
|
||||
A 32-bit, fixed-length encoding in which all Unicode characters
|
||||
can be encoded in a single 32-bit word. The endianness of the encoded
|
||||
data is indicated by the endianness of a byte-order-mark
|
||||
character (#u+feff) prepended to the data; in the absence of such a
|
||||
character on input, input data is assumed to be in big-endian order.
|
||||
Output is written in native byte order with a leading byte-order
|
||||
mark.
|
||||
|
||||
:UTF-32BE [Aliases: :UCS-4BE]
|
||||
A 32-bit, fixed-length encoding in which all Unicode characters
|
||||
encoded in a single 32-bit word. The encoded data is implicitly big-endian;
|
||||
byte-order-mark characters are not interpreted on input or prepended
|
||||
to output.
|
||||
|
||||
:UTF-32LE [Aliases: :UCS-4LE]
|
||||
A 32-bit, fixed-length encoding in which all Unicode characters can
|
||||
encoded in a single 32-bit word. The encoded data is implicitly
|
||||
little-endian; byte-order-mark characters are not interpreted on input
|
||||
or prepended to output.
|
||||
|
||||
:UTF-8 [Aliases: :MULE-UTF-8]
|
||||
An 8-bit, variable-length character encoding in which characters
|
||||
with CHAR-CODEs in the range #x00-#x7f can be encoded in a single
|
||||
octet; characters with larger code values can be encoded in 2 to 4
|
||||
bytes.
|
||||
|
||||
:WINDOWS-31J [Aliases: :CP932 :CSWINDOWS31J]
|
||||
An 8-bit, variable-length character encoding in which
|
||||
character code points in the range #x00-#x7f can be encoded in a
|
||||
single octet; characters with larger code values can be encoded
|
||||
in 2 bytes.
|
||||
")
|
||||
|
||||
(defun parse-ccl-encodings-desc-first-line (line)
|
||||
"Given a line with :ENCODING [Aliases: :X :Y] return a proper cons."
|
||||
(cl-ppcre:register-groups-bind (name aliases)
|
||||
(":([A-Z0-9-]+).*Aliases: (.*)[]]" line)
|
||||
(cons name (mapcar (lambda (alias) (subseq alias 1))
|
||||
(split-sequence:split-sequence #\Space aliases)))))
|
||||
|
||||
(defun parse-ccl-encodings-desc (&optional
|
||||
(desc *ccl-describe-character-encodings*))
|
||||
"Parse the output of the ccl:describe-character-encodings function."
|
||||
(with-input-from-string (s desc)
|
||||
(loop :for line := (read-line s nil nil)
|
||||
:while line
|
||||
:when (and line (< 0 (length line)) (char= #\: (aref line 0)))
|
||||
:collect (parse-ccl-encodings-desc-first-line line))))
|
||||
|
||||
(defun list-encodings-and-aliases ()
|
||||
"Return an alist of encoding names supported by the current
|
||||
implementation, associated with a list of encoding name aliases for each
|
||||
of them."
|
||||
(let ((encoding-and-aliases
|
||||
#+ccl
|
||||
(parse-ccl-encoding-desc (with-output-to-string (*terminal-io*)
|
||||
(ccl:describe-character-encodings)))
|
||||
#+sbcl
|
||||
(let ((result '()))
|
||||
(maphash (lambda (name encoding)
|
||||
(declare (ignore name))
|
||||
(pushnew encoding result))
|
||||
sb-impl::*external-formats*)
|
||||
(mapcar (lambda (encoding)
|
||||
(mapcar (function string-upcase)
|
||||
(slot-value encoding 'sb-impl::names)))
|
||||
result))))
|
||||
(sort encoding-and-aliases #'string< :key #'car)))
|
||||
|
||||
(defun show-encodings ()
|
||||
"List known encodings names and aliases from charsets::*lisp-encodings*."
|
||||
(format *standard-output* "Name ~30TAliases~%")
|
||||
(format *standard-output* "--------~30T--------------~%")
|
||||
(loop
|
||||
:with encodings := (list-encodings-and-aliases)
|
||||
:for (name . aliases) :in encodings
|
||||
:do (format *standard-output* "~a~30T~{~a~^, ~}~%" name aliases))
|
||||
(terpri))
|
||||
|
||||
(defun make-external-format (name)
|
||||
"Return an object suitable as an external format in the current
|
||||
implementation."
|
||||
(let ((encoding (intern name "KEYWORD")))
|
||||
#+ccl
|
||||
(ccl:make-external-format :character-encoding encoding)
|
||||
#+sbcl
|
||||
encoding))
|
||||
Loading…
x
Reference in New Issue
Block a user