Add support for PG options as arguments and in the config file, with assorted fixes

This commit is contained in:
Dimitri Fontaine 2010-04-05 21:17:22 +02:00
parent d63b21c3f9
commit 411a9a53d7
9 changed files with 136 additions and 87 deletions

2
.gitignore vendored
View File

@ -13,3 +13,5 @@
/pgloader/RRRtools.pyc
/reformat/mysql.pyc
/reformat/pgtime.pyc
/pgloader.1
/pgloader.1.html

3
debian/changelog vendored
View File

@ -4,6 +4,9 @@ pgloader (2.3.3-1) unstable; urgency=low
* Add support for filename arguments, which use defaults
* Implement --reject-log and --reject-data
* Add support for --max-parallel-sections and --section-threads
* Support setting any PG option (-o and config file)
* Have --debug show a traceback
* Fix a bug where pgloader would freeze on early error (no such file)
-- Dimitri Fontaine <dim@tapoueh.org> Sun, 4 Apr 2010 19:34:39 +0200

View File

@ -8,10 +8,15 @@ pass = None
log_file = /tmp/pgloader.log
log_min_messages = DEBUG
client_min_messages = WARNING
lc_messages = C
;client_encoding = 'utf-8'
client_encoding = 'latin1'
lc_messages = C
pg_option_client_encoding = 'utf-8'
pg_option_standard_conforming_strings = on
; This setting has no effect other than letting us check option precedence
pg_option_work_mem = 12MB
copy_every = 5
commit_every = 5
#copy_delimiter = %
@ -45,6 +50,7 @@ format = csv
filename = allcols/allcols.data
field_sep = :
columns = *
pg_option_work_mem = 14MB
[errors]
table = errors

View File

@ -42,7 +42,7 @@ errors is issued.
http://pgfoundry.org/projects/pgloader/[], where you'll find a debian
package, a source package and an anonymous CVS.
== Arguments ==
== ARGUMENTS ==
+pgloader+ as of version +2.3.3+ accepts two kinds of arguments, either
section names or file names. If both a section and a file exist with the
@ -178,6 +178,14 @@ You can't use both -F and -I at the same time.
Input data files encoding. Defaults to 'latin9'.
-o, --pg-options::
+
Any option to give to the PostgreSQL server by means of the +SET+
command. You can use this argument more than once to set more than one
option.
+
Example: -o standard_conforming_strings=on -o client_encoding=utf8
-t, --section-threads::
How many threads per section to use, defaults to 1. The command line
@ -245,18 +253,30 @@ pass::
client_encoding::
+
Set this parameter to have pgloader connects to PostgreSQL using this
Set this parameter to have +pgloader+ connect to PostgreSQL using this
encoding.
+
This parameter is optional and defaults to 'latin9'.
+
As of +pgloader 2.3.3+ you can also use +pg_option_client_encoding+ which is
the more general approach.
datestyle::
+
Set this parameter to have pgloader connects to PostgreSQL using this
Set this parameter to have +pgloader+ connect to PostgreSQL using this
datestyle setting.
+
This parameter is optional and has no default value, thus pgloader will
use whatever your PostgreSQL is configured to as default.
+
As of +pgloader 2.3.3+ you can also use +pg_option_datestyle+ which is
the more general approach.
pg_option_<foo>::
Replace <foo> with the name of any option you're allowed to set at the
session level with the +SET+ command, and +pgloader+ will do just that for
you. Consider for example +pg_option_standard_conforming_strings = on+.
copy_every::
+
@ -436,19 +456,30 @@ This parameter is optional and defaults to pipe char +$$'|'$$+.
client_encoding::
+
Set this parameter to have pgloader connects to PostgreSQL using this
Set this parameter to have +pgloader+ connect to PostgreSQL using this
encoding.
+
This parameter is optional and defaults to 'latin9'. If defined on a
table level, this local value will overwrite the global one.
This parameter is optional and defaults to 'latin9'.
+
As of +pgloader 2.3.3+ you can also use +pg_option_client_encoding+ which is
the more general approach.
datestyle::
+
Set this parameter to have pgloader connects to PostgreSQL using this
+datestyle+ setting.
Set this parameter to have +pgloader+ connect to PostgreSQL using this
datestyle setting.
+
This parameter is optional and has no default. If defined on a table
level, this local value will overwrite the global one.
This parameter is optional and has no default value, thus pgloader will
use whatever your PostgreSQL is configured to as default.
+
As of +pgloader 2.3.3+ you can also use +pg_option_datestyle+ which is
the more general approach.
pg_option_<foo>::
Replace <foo> with the name of any option you're allowed to set at the
session level with the +SET+ command, and +pgloader+ will do just that for
you. Consider for example +pg_option_standard_conforming_strings = on+.
null::
+

View File

@ -111,6 +111,9 @@ def parse_options():
default = None,
help = "input files encoding")
parser.add_option("-o", "--pg-options", dest = "pg_options", action = "append",
help = "list of PostgreSQL options you want to SET")
parser.add_option("-t", "--section-threads", dest = "section_threads",
default = pgloader.options.SECTION_THREADS,
type = "int",
@ -252,6 +255,18 @@ def parse_options():
elif opts.quiet:
pgloader.options.CLIENT_MIN_MESSAGES = logging.ERROR
if opts.pg_options:
pgloader.options.PG_OPTIONS = {}
for o in opts.pg_options:
try:
n, v = [x.strip() for x in o.split('=')]
if v == "":
raise ValueError
pgloader.options.PG_OPTIONS[n] = v
except ValueError, e:
print >>sys.stderr, \
"Error: PostgreSQL options must have the form 'name=value'"
sys.exit(1)
if opts.psycopg1:
pgloader.options.PSYCOPG_VERSION = 1
@ -285,6 +300,7 @@ def parse_config(conffile):
from pgloader.options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC
from pgloader.options import NULL, EMPTY_STRING
from pgloader.options import CLIENT_MIN_MESSAGES, LOG_FILE
from pgloader.options import PG_OPTIONS
from pgloader.tools import check_dirname
# first read the logging configuration
@ -605,6 +621,8 @@ def load_data():
started[s] .set()
finished[s].set()
log.error(e)
if DEBUG:
raise
except IOError, e:
# No space left on device? can't log it
@ -632,6 +650,9 @@ def load_data():
else:
log.error('%s' % e)
if DEBUG:
raise
if PEDANTIC:
# was: threads[s].print_stats()
# but now thread[s] is no more alive
@ -679,6 +700,10 @@ if __name__ == "__main__":
try:
ret = load_data()
except Exception, e:
from pgloader.options import DEBUG
print DEBUG
if DEBUG:
raise
sys.stderr.write(str(e) + '\n')
sys.exit(1)

View File

@ -11,6 +11,7 @@ from options import TRUNCATE, VACUUM
from options import INPUT_ENCODING, PG_CLIENT_ENCODING, DATESTYLE
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
from options import PSYCOPG_VERSION
from options import PG_OPTIONS
from tools import PGLoader_Error
from logger import log
@ -66,11 +67,9 @@ class db:
self.copy_sep = COPY_SEP
self.copy_every = copy_every
self.commit_every = commit_every
self.client_encoding = client_encoding
self.datestyle = DATESTYLE
self.null = NULL
self.empty_string = EMPTY_STRING
self.lc_messages = None
self.pg_options = {}
# this allows to specify configuration has columns = *
# when true, we don't include column list in COPY statements
@ -107,51 +106,18 @@ class db:
pass
self.dbconn = None
def set_encoding(self):
""" set connection encoding to self.client_encoding """
# debug only cause reconnecting happens on every
# configured section
self.log.debug('Setting client encoding to %s', self.client_encoding)
def set_pg_options(self):
""" set pg_options """
for opt, val in self.pg_options.items():
self.log.debug('Setting %s to %s', opt, val)
sql = 'set session client_encoding to %s'
cursor = self.dbconn.cursor()
try:
cursor.execute(sql, [self.client_encoding])
except psycopg.ProgrammingError, e:
raise PGLoader_Error, e
cursor.close()
def set_datestyle(self):
""" set session datestyle to self.datestyle """
if self.datestyle is None:
return
# debug only cause reconnecting happens on every
# configured section
self.log.debug('Setting datestyle to %s', self.datestyle)
sql = 'set session datestyle to %s'
cursor = self.dbconn.cursor()
cursor.execute(sql, [self.datestyle])
cursor.close()
def set_lc_messages(self):
""" set lc_messages to self.lc_messages """
if self.lc_messages is None:
return
# debug only cause reconnecting happens on every
# configured section
self.log.debug('Setting lc_messages to %s', self.lc_messages)
sql = 'set session lc_messages to %s'
cursor = self.dbconn.cursor()
try:
cursor.execute(sql, [self.lc_messages])
except psycopg.ProgrammingError, e:
raise PGLoader_Error, e
cursor.close()
sql = 'set session %s to %%s' % opt
cursor = self.dbconn.cursor()
try:
cursor.execute(sql, [val])
except (psycopg.ProgrammingError, psycopg.DataError), e:
raise PGLoader_Error, e
cursor.close()
def get_all_columns(self, tablename):
""" select the columns name list from catalog """
@ -214,9 +180,7 @@ ORDER BY attnum
self.log.debug('Debug: connecting to dns %s', self.dsn)
self.dbconn = psycopg.connect(self.dsn)
self.set_encoding()
self.set_datestyle()
self.set_lc_messages()
self.set_pg_options()
except psycopg.OperationalError, e:
# e.g. too many connections

View File

@ -8,6 +8,7 @@ PSYCOPG_VERSION = None
INPUT_ENCODING = None
PG_CLIENT_ENCODING = 'latin9'
PG_OPTIONS = None
DATESTYLE = None
COPY_SEP = None

View File

@ -18,6 +18,7 @@ from options import TRUNCATE, VACUUM, TRIGGERS
from options import COUNT, FROM_COUNT, FROM_ID
from options import INPUT_ENCODING, PG_CLIENT_ENCODING
from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING
from options import PG_OPTIONS
from options import NEWLINE_ESCAPES
from options import UDC_PREFIX
from options import REFORMAT_PATH
@ -163,17 +164,15 @@ class PGLoader(threading.Thread):
config.get(section, 'pass'),
connect = False)
if config.has_option(section, 'client_encoding'):
self.db.client_encoding = parse_config_string(
config.get(section, 'client_encoding'))
for opt in ['client_encoding', 'datestyle', 'lc_messages']:
if config.has_option(section, opt):
self.db.pg_options[opt] = \
parse_config_string(config.get(section, opt))
if config.has_option(section, 'lc_messages'):
self.db.lc_messages = parse_config_string(
config.get(section, 'lc_messages'))
if config.has_option(section, 'datestyle'):
self.db.datestyle = parse_config_string(
config.get(section, 'datestyle'))
# PostgreSQL options
from tools import parse_pg_options
parse_pg_options(self.log, config, section, self.db.pg_options)
self.log.debug("_dbconfig: %s" % str(self.db.pg_options))
if config.has_option(section, 'copy_every'):
self.db.copy_every = config.getint(section, 'copy_every')
@ -260,29 +259,28 @@ class PGLoader(threading.Thread):
# needed to instanciate self.reject while in template section
self.reject = None
# optionnal local option client_encoding
if config.has_option(name, 'client_encoding'):
self.db.client_encoding = parse_config_string(
config.get(name, 'client_encoding'))
if not DRY_RUN:
self.log.debug("client_encoding: '%s'", self.db.client_encoding)
# optionnal local option input_encoding
self.input_encoding = INPUT_ENCODING
if config.has_option(name, 'input_encoding'):
self.input_encoding = parse_config_string(
config.get(name, 'input_encoding'))
self.log.debug("input_encoding: '%s'", self.input_encoding)
# optionnal local option datestyle
if not DRY_RUN and config.has_option(name, 'datestyle'):
self.db.datestyle = parse_config_string(
config.get(name, 'datestyle'))
# optionnal local option client_encoding and datestyle
for opt in ['client_encoding', 'datestyle']:
if config.has_option(name, opt):
self.db.pg_options[opt] = parse_config_string(config.get(name, opt))
if not DRY_RUN:
self.log.debug("datestyle: '%s'", self.db.datestyle)
if not DRY_RUN:
self.log.debug("%s: '%s'", opt, self.db.pg_options[opt])
# optionnal local pg_options
# precedence is given to command line parsing, which is in PG_OPTIONS
from tools import parse_pg_options
parse_pg_options(log, config, name, self.db.pg_options, overwrite=True)
if not self.template:
if PG_OPTIONS:
self.db.pg_options.update(PG_OPTIONS)
##
# data filename
@ -849,6 +847,7 @@ class PGLoader(threading.Thread):
except Exception, e:
# resources get freed in self.terminate()
self.terminate()
self.log.error(e)
raise

View File

@ -139,7 +139,25 @@ def parse_config_string(str):
return str
def parse_pg_options(log, config, section, pg_options, overwrite=False):
    """ Get all the pg_option_ prefixed options from the section

    Every option of the given section named pg_option_<name> is stored
    into pg_options under the key <name>, with its value exactly as
    returned by config.get().

    log        -- logger-like object, only its warning() method is used
    config     -- ConfigParser-like object (options, has_option, get)
    section    -- name of the configuration section to read
    pg_options -- dict, updated in place and also returned
    overwrite  -- when true, replace entries already present in
                  pg_options; otherwise existing entries are kept

    For compatibility with the older dedicated settings, the options
    pg_option_client_encoding, pg_option_lc_messages and
    pg_option_datestyle are ignored, with a warning, when the very same
    section also sets the legacy option of the same name.
    """
    # PostgreSQL options must begin with the prefix pg_option_
    prefix = 'pg_option_'

    # hysterical raisins: these settings predate the pg_option_ prefix
    legacy = ('client_encoding', 'lc_messages', 'datestyle')

    for o in config.options(section):
        if not o.startswith(prefix):
            continue

        opt = o[len(prefix):]
        val = config.get(section, o)

        # The legacy spelling wins when both are given in this section.
        # Test membership in the tuple of names: a substring test against
        # one of the names would wrongly drop e.g. pg_option_datestyle.
        if opt in legacy and config.has_option(section, opt):
            log.warning("Ignoring %s.%s for %s.%s" \
                        % (section, o, section, opt))
            continue

        if overwrite or opt not in pg_options:
            pg_options[opt] = val

    return pg_options
def read_path(strpath, log, path = [], check = True):
""" read a path configuration element, discarding non-existing entries """