diff --git a/.gitignore b/.gitignore index 10d5c27..5c80b3c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ /pgloader/RRRtools.pyc /reformat/mysql.pyc /reformat/pgtime.pyc +/pgloader.1 +/pgloader.1.html diff --git a/debian/changelog b/debian/changelog index eede21a..c6fda74 100644 --- a/debian/changelog +++ b/debian/changelog @@ -4,6 +4,9 @@ pgloader (2.3.3-1) unstable; urgency=low * Add support for filename arguments, which use defaults * Implement --reject-log and --reject-data * Add support for --max-parallel-sections and --section-threads + * Support setting any PG option (-o and config file) + * Have --debug show a traceback + * Fix a bug where pgloader would freeze on early error (no such file) -- Dimitri Fontaine Sun, 4 Apr 2010 19:34:39 +0200 diff --git a/examples/pgloader.conf b/examples/pgloader.conf index d70d798..edf865e 100644 --- a/examples/pgloader.conf +++ b/examples/pgloader.conf @@ -8,10 +8,15 @@ pass = None log_file = /tmp/pgloader.log log_min_messages = DEBUG client_min_messages = WARNING -lc_messages = C ;client_encoding = 'utf-8' client_encoding = 'latin1' +lc_messages = C +pg_option_client_encoding = 'utf-8' +pg_option_standard_conforming_strings = on +; This setting has no effect other than allowing to check option precedence +pg_option_work_mem = 12MB + copy_every = 5 commit_every = 5 #copy_delimiter = % @@ -45,6 +50,7 @@ format = csv filename = allcols/allcols.data field_sep = : columns = * +pg_option_work_mem = 14MB [errors] table = errors diff --git a/pgloader.1.txt b/pgloader.1.txt index 58d3aca..9a1607c 100644 --- a/pgloader.1.txt +++ b/pgloader.1.txt @@ -42,7 +42,7 @@ errors is issued. http://pgfoundry.org/projects/pgloader/[], where you'll find a debian package, a source package and an anonymous CVS. -== Arguments == +== ARGUMENTS == +pgloader+ as of version +2.3.3+ accepts two kinds of arguments, either section names of file names. 
If both a section and a file exist with the @@ -178,6 +178,14 @@ You can't use both -F and -I at the same time. Input data files encoding. Defaults to 'latin9'. +-o, --pg-options:: ++ +Any option to give to the PostgreSQL server by means of the +SET+ +command. You can use this argument more than once to set more than one +option. ++ +Example: -o standard_conforming_strings=on -o client_encoding=utf8 + -t, --section-threads:: How many threads per section to use, defaults to 1. The command line @@ -245,18 +253,30 @@ pass:: client_encoding:: + -Set this parameter to have pgloader connects to PostgreSQL using this +Set this parameter to have +pgloader+ connect to PostgreSQL using this encoding. + This parameter is optional and defaults to 'latin9'. ++ +As of +pgloader 2.3.3+ you can also use +pg_option_client_encoding+ which is +the more general approach. datestyle:: + -Set this parameter to have pgloader connects to PostgreSQL using this +Set this parameter to have +pgloader+ connect to PostgreSQL using this datestyle setting. + This parameter is optional and has no default value, thus pgloader will use whatever your PostgreSQL is configured to as default. ++ +As of +pgloader 2.3.3+ you can also use +pg_option_datestyle+ which is +the more general approach. + +pg_option_NAME:: + + Replace NAME with any option you're allowed to set up for the session + only with the +SET+ command, and +pgloader+ will do just that for + you. Consider for example +pg_option_standard_conforming_strings = on+. copy_every:: + @@ -436,19 +456,30 @@ This parameter is optional and defaults to pipe char +$$'|'$$+. client_encoding:: + -Set this parameter to have pgloader connects to PostgreSQL using this +Set this parameter to have +pgloader+ connect to PostgreSQL using this encoding. + -This parameter is optional and defaults to 'latin9'. If defined on a -table level, this local value will overwrite the global one. +This parameter is optional and defaults to 'latin9'.
++ +As of +pgloader 2.3.3+ you can also use +pg_option_client_encoding+ which is +the more general approach. datestyle:: + -Set this parameter to have pgloader connects to PostgreSQL using this -+datestyle+ setting. +Set this parameter to have +pgloader+ connect to PostgreSQL using this +datestyle setting. + -This parameter is optional and has no default. If defined on a table -level, this local value will overwrite the global one. +This parameter is optional and has no default value, thus pgloader will +use whatever your PostgreSQL is configured to as default. ++ +As of +pgloader 2.3.3+ you can also use +pg_option_datestyle+ which is +the more general approach. + +pg_option_NAME:: + + Replace NAME with any option you're allowed to set up for the session + only with the +SET+ command, and +pgloader+ will do just that for + you. Consider for example +pg_option_standard_conforming_strings = on+. null:: + diff --git a/pgloader.py b/pgloader.py index 77cae58..8edadb6 100755 --- a/pgloader.py +++ b/pgloader.py @@ -111,6 +111,9 @@ def parse_options(): default = None, help = "input files encoding") + parser.add_option("-o", "--pg-options", dest = "pg_options", action = "append", + help = "list of PostgreSQL options you want to SET") + parser.add_option("-t", "--section-threads", dest = "section_threads", default = pgloader.options.SECTION_THREADS, type = "int", @@ -252,6 +255,18 @@ def parse_options(): elif opts.quiet: pgloader.options.CLIENT_MIN_MESSAGES = logging.ERROR + if opts.pg_options: + pgloader.options.PG_OPTIONS = {} + for o in opts.pg_options: + try: + n, v = [x.strip() for x in o.split('=')] + if v == "": + raise ValueError + pgloader.options.PG_OPTIONS[n] = v + except ValueError, e: + print >>sys.stderr, \ + "Error: PostgreSQL options must have the form 'name=value'" + sys.exit(1) if opts.psycopg1: pgloader.options.PSYCOPG_VERSION = 1 @@ -285,6 +300,7 @@ def parse_config(conffile): from pgloader.options import DRY_RUN, VERBOSE, DEBUG, PEDANTIC from pgloader.options
import NULL, EMPTY_STRING from pgloader.options import CLIENT_MIN_MESSAGES, LOG_FILE + from pgloader.options import PG_OPTIONS from pgloader.tools import check_dirname # first read the logging configuration @@ -605,6 +621,8 @@ def load_data(): started[s] .set() finished[s].set() log.error(e) + if DEBUG: + raise except IOError, e: # No space left on device? can't log it @@ -632,6 +650,9 @@ def load_data(): else: log.error('%s' % e) + if DEBUG: + raise + if PEDANTIC: # was: threads[s].print_stats() # but now thread[s] is no more alive @@ -679,6 +700,10 @@ if __name__ == "__main__": try: ret = load_data() except Exception, e: + from pgloader.options import DEBUG + print DEBUG + if DEBUG: + raise sys.stderr.write(str(e) + '\n') sys.exit(1) diff --git a/pgloader/db.py b/pgloader/db.py index 1b7421c..e58c66f 100644 --- a/pgloader/db.py +++ b/pgloader/db.py @@ -11,6 +11,7 @@ from options import TRUNCATE, VACUUM from options import INPUT_ENCODING, PG_CLIENT_ENCODING, DATESTYLE from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING from options import PSYCOPG_VERSION +from options import PG_OPTIONS from tools import PGLoader_Error from logger import log @@ -66,11 +67,9 @@ class db: self.copy_sep = COPY_SEP self.copy_every = copy_every self.commit_every = commit_every - self.client_encoding = client_encoding - self.datestyle = DATESTYLE self.null = NULL self.empty_string = EMPTY_STRING - self.lc_messages = None + self.pg_options = {} # this allows to specify configuration has columns = * # when true, we don't include column list in COPY statements @@ -107,51 +106,18 @@ class db: pass self.dbconn = None - def set_encoding(self): - """ set connection encoding to self.client_encoding """ - # debug only cause reconnecting happens on every - # configured section - self.log.debug('Setting client encoding to %s', self.client_encoding) + def set_pg_options(self): + """ set pg_options """ + for opt, val in self.pg_options.items(): + self.log.debug('Setting %s to %s', 
opt, val) - sql = 'set session client_encoding to %s' - cursor = self.dbconn.cursor() - try: - cursor.execute(sql, [self.client_encoding]) - except psycopg.ProgrammingError, e: - raise PGLoader_Error, e - cursor.close() - - def set_datestyle(self): - """ set session datestyle to self.datestyle """ - - if self.datestyle is None: - return - - # debug only cause reconnecting happens on every - # configured section - self.log.debug('Setting datestyle to %s', self.datestyle) - - sql = 'set session datestyle to %s' - cursor = self.dbconn.cursor() - cursor.execute(sql, [self.datestyle]) - cursor.close() - - def set_lc_messages(self): - """ set lc_messages to self.lc_messages """ - if self.lc_messages is None: - return - - # debug only cause reconnecting happens on every - # configured section - self.log.debug('Setting lc_messages to %s', self.lc_messages) - - sql = 'set session lc_messages to %s' - cursor = self.dbconn.cursor() - try: - cursor.execute(sql, [self.lc_messages]) - except psycopg.ProgrammingError, e: - raise PGLoader_Error, e - cursor.close() + sql = 'set session %s to %%s' % opt + cursor = self.dbconn.cursor() + try: + cursor.execute(sql, [val]) + except (psycopg.ProgrammingError, psycopg.DataError), e: + raise PGLoader_Error, e + cursor.close() def get_all_columns(self, tablename): """ select the columns name list from catalog """ @@ -214,9 +180,7 @@ ORDER BY attnum self.log.debug('Debug: connecting to dns %s', self.dsn) self.dbconn = psycopg.connect(self.dsn) - self.set_encoding() - self.set_datestyle() - self.set_lc_messages() + self.set_pg_options() except psycopg.OperationalError, e: # e.g. 
too many connections diff --git a/pgloader/options.py b/pgloader/options.py index 1b3e3f5..229dac9 100644 --- a/pgloader/options.py +++ b/pgloader/options.py @@ -8,6 +8,7 @@ PSYCOPG_VERSION = None INPUT_ENCODING = None PG_CLIENT_ENCODING = 'latin9' +PG_OPTIONS = None DATESTYLE = None COPY_SEP = None diff --git a/pgloader/pgloader.py b/pgloader/pgloader.py index 2364f73..5f4da44 100644 --- a/pgloader/pgloader.py +++ b/pgloader/pgloader.py @@ -18,6 +18,7 @@ from options import TRUNCATE, VACUUM, TRIGGERS from options import COUNT, FROM_COUNT, FROM_ID from options import INPUT_ENCODING, PG_CLIENT_ENCODING from options import COPY_SEP, FIELD_SEP, CLOB_SEP, NULL, EMPTY_STRING +from options import PG_OPTIONS from options import NEWLINE_ESCAPES from options import UDC_PREFIX from options import REFORMAT_PATH @@ -163,17 +164,15 @@ class PGLoader(threading.Thread): config.get(section, 'pass'), connect = False) - if config.has_option(section, 'client_encoding'): - self.db.client_encoding = parse_config_string( - config.get(section, 'client_encoding')) + for opt in ['client_encoding', 'datestyle', 'lc_messages']: + if config.has_option(section, opt): + self.db.pg_options[opt] = \ + parse_config_string(config.get(section, opt)) - if config.has_option(section, 'lc_messages'): - self.db.lc_messages = parse_config_string( - config.get(section, 'lc_messages')) - - if config.has_option(section, 'datestyle'): - self.db.datestyle = parse_config_string( - config.get(section, 'datestyle')) + # PostgreSQL options + from tools import parse_pg_options + parse_pg_options(self.log, config, section, self.db.pg_options) + self.log.debug("_dbconfig: %s" % str(self.db.pg_options)) if config.has_option(section, 'copy_every'): self.db.copy_every = config.getint(section, 'copy_every') @@ -260,29 +259,28 @@ class PGLoader(threading.Thread): # needed to instanciate self.reject while in template section self.reject = None - # optionnal local option client_encoding - if config.has_option(name, 
'client_encoding'): - self.db.client_encoding = parse_config_string( - config.get(name, 'client_encoding')) - - if not DRY_RUN: - self.log.debug("client_encoding: '%s'", self.db.client_encoding) - # optionnal local option input_encoding self.input_encoding = INPUT_ENCODING if config.has_option(name, 'input_encoding'): self.input_encoding = parse_config_string( config.get(name, 'input_encoding')) - self.log.debug("input_encoding: '%s'", self.input_encoding) - # optionnal local option datestyle - if not DRY_RUN and config.has_option(name, 'datestyle'): - self.db.datestyle = parse_config_string( - config.get(name, 'datestyle')) + # optionnal local option client_encoding and datestyle + for opt in ['client_encoding', 'datestyle']: + if config.has_option(name, opt): + self.db.pg_options[opt] = parse_config_string(config.get(name, opt)) - if not DRY_RUN: - self.log.debug("datestyle: '%s'", self.db.datestyle) + if not DRY_RUN: + self.log.debug("%s: '%s'", opt, self.db.pg_options[opt]) + + # optionnal local pg_options + # precedence is given to command line parsing, which is in PG_OPTIONS + from tools import parse_pg_options + parse_pg_options(log, config, name, self.db.pg_options, overwrite=True) + if not self.template: + if PG_OPTIONS: + self.db.pg_options.update(PG_OPTIONS) ## # data filename @@ -849,6 +847,7 @@ class PGLoader(threading.Thread): except Exception, e: # resources get freed in self.terminate() + self.terminate() self.log.error(e) raise diff --git a/pgloader/tools.py b/pgloader/tools.py index 1749135..c9ad98c 100644 --- a/pgloader/tools.py +++ b/pgloader/tools.py @@ -139,7 +139,25 @@ def parse_config_string(str): return str - + +def parse_pg_options(log, config, section, pg_options, overwrite=False): + """ Get all the pg_options_ prefixed options from the section""" + # PostgreSQL options must begin with the prefix pg_option_ + for o in [x for x in config.options(section) + if x.startswith('pg_option_')]: + opt = o[len('pg_option_'):] + val = 
config.get(section, o) + + # hysterical raisins + for compat in ['client_encoding', 'lc_messages', 'datestyle']: + if opt == compat and config.has_option(section, compat): + log.warning("Ignoring %s.%s for %s.%s" \ + % (section, o, section, opt)) + + if opt not in compat and (overwrite or opt not in pg_options): + pg_options[opt] = val + + return pg_options def read_path(strpath, log, path = [], check = True): """ read a path configuration element, discarding non-existing entries """