mirror of
https://github.com/dimitri/pgloader.git
synced 2025-08-08 15:27:00 +02:00
Synchronize threads starts and ends with an Event
This commit is contained in:
parent
45c670f576
commit
a27a801f62
31
pgloader.py
31
pgloader.py
@ -463,6 +463,8 @@ def load_data():
|
||||
sections.sort()
|
||||
|
||||
threads = {}
|
||||
started = {}
|
||||
finished = {}
|
||||
current = 0
|
||||
interrupted = False
|
||||
|
||||
@ -477,7 +479,11 @@ def load_data():
|
||||
|
||||
try:
|
||||
summary [s] = []
|
||||
loader = PGLoader(s, config, sem, summary[s])
|
||||
started [s] = threading.Event()
|
||||
finished[s] = threading.Event()
|
||||
|
||||
loader = PGLoader(s, config, sem,
|
||||
(started[s], finished[s]), summary[s])
|
||||
if not loader.template:
|
||||
filename = loader.filename
|
||||
input_encoding = loader.input_encoding
|
||||
@ -485,11 +491,13 @@ def load_data():
|
||||
|
||||
# .start() will sem.acquire(), so we won't have more
|
||||
# than max_running threads running at any time.
|
||||
log.info("Starting thread for %s" % s)
|
||||
log.debug("Starting thread for %s" % s)
|
||||
threads[s].start()
|
||||
else:
|
||||
log.info("Skipping section %s, which is a template" % s)
|
||||
summary.pop(s)
|
||||
|
||||
for d in (summary, started, finished):
|
||||
d.pop(s)
|
||||
|
||||
except PGLoader_Error, e:
|
||||
if e == '':
|
||||
@ -512,19 +520,12 @@ def load_data():
|
||||
|
||||
current += 1
|
||||
|
||||
if not interrupted:
|
||||
from pgloader.tools import running_threads
|
||||
# make sure each thread is started, then each one is done
|
||||
from pgloader.tools import check_events
|
||||
|
||||
n = running_threads(threads)
|
||||
log.info("Waiting for %d threads to terminate" % n)
|
||||
time.sleep(2)
|
||||
|
||||
# Try to acquire all semaphore entries
|
||||
for i in range(max_running):
|
||||
sem.acquire()
|
||||
log.debug("Acquired %d times, " % (i+1) + \
|
||||
"still waiting for %d threads to terminate" \
|
||||
% running_threads(threads))
|
||||
check_events(started, log, "is running")
|
||||
log.info("All threads are started, wait for them to terminate")
|
||||
check_events(finished, log, "processing is over")
|
||||
|
||||
# total duration
|
||||
td = time.time() - begin
|
||||
|
@ -9,7 +9,7 @@ import os, sys, os.path, time, codecs, threading
|
||||
from cStringIO import StringIO
|
||||
|
||||
from logger import log, getLogger
|
||||
from tools import PGLoader_Error, Reject, parse_config_string
|
||||
from tools import PGLoader_Error, Reject, parse_config_string, check_events
|
||||
from db import db
|
||||
from lo import ifx_clob, ifx_blob
|
||||
|
||||
@ -31,7 +31,8 @@ class PGLoader(threading.Thread):
|
||||
import data with COPY or update blob data with UPDATE.
|
||||
"""
|
||||
|
||||
def __init__(self, name, config, sem, stats,
|
||||
def __init__(self, name, config, sem, (started, finished),
|
||||
stats,
|
||||
logname = None,
|
||||
reject = None, queue = None, lock = None, copy_sep = None):
|
||||
""" Init with a configuration section """
|
||||
@ -47,13 +48,17 @@ class PGLoader(threading.Thread):
|
||||
|
||||
# sem, stats and queue (if not None) are global objects
|
||||
# sem is shared by all threads at the same level
|
||||
# started is a threading.Event object to set() once in run()
|
||||
# finished is a threading.Event object to set() once processing is over
|
||||
# stats is a private entry of a shared dict
|
||||
# queue is given when reading in round robin mode
|
||||
# lock is a threading.Lock for reading sync
|
||||
# reject is a common reject object, protected by a parent-thread rlock
|
||||
# reject is the common reject object
|
||||
#
|
||||
self.sem = sem
|
||||
self.started = started
|
||||
self.stats = stats
|
||||
self.finished = finished
|
||||
self.queue = queue
|
||||
self.lock = lock
|
||||
self.reject = reject
|
||||
@ -86,10 +91,10 @@ class PGLoader(threading.Thread):
|
||||
|
||||
if config.has_option(name, 'template'):
|
||||
self.template = True
|
||||
self.log.info("[%s] is a template", self.name)
|
||||
self.log.debug("[%s] is a template", self.name)
|
||||
|
||||
if not self.template:
|
||||
self.log.info("[%s] parse configuration", self.name)
|
||||
self.log.debug("[%s] parse configuration", self.name)
|
||||
|
||||
if not self.template:
|
||||
# check if the section wants to use a template
|
||||
@ -108,7 +113,7 @@ class PGLoader(threading.Thread):
|
||||
raise PGLoader_Error, m
|
||||
|
||||
# first load template configuration
|
||||
self.log.info("Reading configuration from template " +\
|
||||
self.log.debug("Reading configuration from template " +\
|
||||
"section [%s]", self.template)
|
||||
|
||||
self.real_log = self.log
|
||||
@ -129,7 +134,7 @@ class PGLoader(threading.Thread):
|
||||
|
||||
if not self.template:
|
||||
# now load specific configuration
|
||||
self.log.info("Reading configuration from section [%s]", name)
|
||||
self.log.debug("Reading configuration from section [%s]", name)
|
||||
self._read_conf(name, config, db)
|
||||
|
||||
# Now reset database connection
|
||||
@ -237,8 +242,8 @@ class PGLoader(threading.Thread):
|
||||
self.reject = Reject(self.log,
|
||||
self.reject_log, self.reject_data)
|
||||
|
||||
self.log.info('reject log in %s', self.reject.reject_log)
|
||||
self.log.info('rejected data in %s', self.reject.reject_data)
|
||||
self.log.debug('reject log in %s', self.reject.reject_log)
|
||||
self.log.debug('rejected data in %s', self.reject.reject_data)
|
||||
|
||||
else:
|
||||
# needed to instantiate self.reject while in template section
|
||||
@ -551,7 +556,7 @@ class PGLoader(threading.Thread):
|
||||
if 'format' not in self.__dict__:
|
||||
raise PGLoader_Error, "Please configure %s.format" % name
|
||||
|
||||
self.log.info("File '%s' will be read in %s format" \
|
||||
self.log.debug("File '%s' will be read in %s format" \
|
||||
% (self.filename, self.format))
|
||||
|
||||
if self.format.lower() == 'csv':
|
||||
@ -773,8 +778,11 @@ class PGLoader(threading.Thread):
|
||||
# care about number of threads launched
|
||||
self.sem.acquire()
|
||||
|
||||
# tell parent thread we are running now
|
||||
self.started.set()
|
||||
|
||||
# Announce the beginning of the work
|
||||
self.log.info("%s launched" % self.logname)
|
||||
self.log.debug("%s processing" % self.logname)
|
||||
|
||||
if self.section_threads == 1:
|
||||
if 'reader' in self.__dict__ and self.reader.start is not None:
|
||||
@ -789,9 +797,7 @@ class PGLoader(threading.Thread):
|
||||
except Exception, e:
|
||||
self.log.error(e)
|
||||
|
||||
self.log.info("Releasing %s" % self.logname)
|
||||
self.sem.release()
|
||||
|
||||
self.terminate()
|
||||
return
|
||||
|
||||
# Multi-Threaded processing of current section
|
||||
@ -809,26 +815,19 @@ class PGLoader(threading.Thread):
|
||||
# here we need a special thread reading the file
|
||||
self.round_robin_read()
|
||||
|
||||
self.log.info("releasing %s" % self.logname)
|
||||
self.sem.release()
|
||||
self.terminate()
|
||||
return
|
||||
|
||||
def wait_for_workers(self, sem, workers):
|
||||
"""
|
||||
Try to acquire all semaphore entries --- success means no
|
||||
more thread is running
|
||||
"""
|
||||
from tools import running_threads
|
||||
def terminate(self):
|
||||
""" Announce it's over and free the concurrency control semaphore """
|
||||
|
||||
n = running_threads(workers)
|
||||
self.log.info("Waiting for %d/%d threads to terminate" \
|
||||
% (n, len(workers)))
|
||||
self.log.debug("releasing %s semaphore" % self.logname)
|
||||
self.sem.release()
|
||||
|
||||
# tell parent thread processing is now over, here
|
||||
self.log.debug("Announce it's over")
|
||||
self.finished.set()
|
||||
|
||||
for i in range(n):
|
||||
sem.acquire()
|
||||
self.log.debug("Acquired %d times, " % (i+1) + \
|
||||
"still waiting for %d threads to terminate" \
|
||||
% running_threads(workers))
|
||||
return
|
||||
|
||||
def split_file_read(self):
|
||||
@ -874,13 +873,18 @@ class PGLoader(threading.Thread):
|
||||
sem = threading.BoundedSemaphore(self.section_threads)
|
||||
summary = {}
|
||||
threads = {}
|
||||
started = {}
|
||||
finished = {}
|
||||
|
||||
for current in range(self.section_threads):
|
||||
try:
|
||||
summary[current] = []
|
||||
current_name = "%s[%d]" % (self.name, current)
|
||||
started[current] = threading.Event()
|
||||
finished[current] = threading.Event()
|
||||
current_name = "%s.%d" % (self.name, current)
|
||||
|
||||
loader = PGLoader(self.name, self.config, sem,
|
||||
(started[current], finished[current]),
|
||||
summary[current],
|
||||
logname = current_name,
|
||||
reject = self.reject)
|
||||
@ -895,14 +899,13 @@ class PGLoader(threading.Thread):
|
||||
except Exception, e:
|
||||
raise
|
||||
|
||||
# wait for loaders completion, first let them some time to
|
||||
# be started
|
||||
time.sleep(2)
|
||||
# wait for workers to have started, then wait for them to terminate
|
||||
check_events(started, self.log, "is running")
|
||||
check_events(finished, self.log, "processing is over")
|
||||
|
||||
self.wait_for_workers(sem, threads)
|
||||
self.finish_processing()
|
||||
self.duration = time.time() - self.init_time
|
||||
self.log.info('No more threads are running, %s done' % self.name)
|
||||
self.log.debug('No more threads are running, %s done' % self.name)
|
||||
|
||||
stats = [0, 0]
|
||||
for s in summary:
|
||||
@ -926,6 +929,8 @@ class PGLoader(threading.Thread):
|
||||
sem = threading.BoundedSemaphore(self.section_threads)
|
||||
summary = {}
|
||||
threads = {}
|
||||
started = {}
|
||||
finished = {}
|
||||
|
||||
for current in range(self.section_threads):
|
||||
queues[current] = RRReader()
|
||||
@ -938,9 +943,12 @@ class PGLoader(threading.Thread):
|
||||
|
||||
try:
|
||||
summary [current] = []
|
||||
current_name = "%s[%d]" % (self.name, current)
|
||||
started [current] = threading.Event()
|
||||
finished[current] = threading.Event()
|
||||
current_name = "%s.%d" % (self.name, current)
|
||||
|
||||
loader = PGLoader(self.name, self.config, sem,
|
||||
(started[current], finished[current]),
|
||||
summary[current],
|
||||
logname = current_name,
|
||||
reject = self.reject,
|
||||
@ -961,12 +969,12 @@ class PGLoader(threading.Thread):
|
||||
|
||||
if len(threads) != self.section_threads:
|
||||
self.log.error("Couldn't start all threads, check previous errors")
|
||||
self.wait_for_workers(sem, threads)
|
||||
|
||||
check_events([x for x in finished if threads[x].isAlive()],
|
||||
self.log, "processing is over")
|
||||
return
|
||||
|
||||
# wait for loaders completion, first let them some time to
|
||||
# be started
|
||||
time.sleep(2)
|
||||
check_events(started, self.log, "is running")
|
||||
|
||||
# Now self.section_threads are started and we have a queue and
|
||||
# a Condition for each of them.
|
||||
@ -1002,17 +1010,19 @@ class PGLoader(threading.Thread):
|
||||
c = (n / self.rrqueue_size) % self.section_threads
|
||||
|
||||
# we could have some locks to release here
|
||||
self.log.debug("p=%d c=%d n=%d (n/rrqueue_size)%%%d=%d (n+1/rrqueue_size)%%%d=%d" \
|
||||
self.log.debug("p=%d c=%d n=%d (n/rrqueue_size)%%%d=%d " \
|
||||
% (p, c, n,
|
||||
self.section_threads,
|
||||
(n/self.rrqueue_size) % self.section_threads,
|
||||
self.section_threads,
|
||||
(n/self.rrqueue_size) % self.section_threads) + \
|
||||
"(n+1/rrqueue_size)%%%d=%d" \
|
||||
% (self.section_threads,
|
||||
((n+1)/self.rrqueue_size) % self.section_threads))
|
||||
|
||||
if p != c or (n % self.rrqueue_size != 0):
|
||||
self.log.debug("locks[%d].release" % p)
|
||||
locks[p].release()
|
||||
|
||||
# mark all worker threads as done
|
||||
k = threads.keys()
|
||||
for c in range(self.section_threads):
|
||||
self.log.debug("locks[%d].acquire to set %s.done = True" \
|
||||
@ -1024,10 +1034,12 @@ class PGLoader(threading.Thread):
|
||||
self.log.debug("locks[%d].release (done set)" % c)
|
||||
locks[c].release()
|
||||
|
||||
self.wait_for_workers(sem, threads)
|
||||
# wait for workers to finish processing
|
||||
check_events(finished, self.log, "processing is over")
|
||||
|
||||
self.finish_processing()
|
||||
self.duration = time.time() - self.init_time
|
||||
self.log.info('%s done' % self.name)
|
||||
self.log.debug('%s done' % self.name)
|
||||
|
||||
stats = [0, 0]
|
||||
for s in summary:
|
||||
@ -1080,7 +1092,7 @@ class PGLoader(threading.Thread):
|
||||
if TRIGGERS and not DRY_RUN:
|
||||
self.db.enable_triggers(self.table)
|
||||
|
||||
self.log.info("loading done")
|
||||
self.log.debug("loading done")
|
||||
return
|
||||
|
||||
def update_summary(self):
|
||||
|
@ -187,16 +187,13 @@ def check_dirname(path):
|
||||
return True, None
|
||||
|
||||
|
||||
def check_events(events, log, context = "is running"):
    """ Block until every event of the given mapping has been set.

    events  -- mapping of a thread name (or index) to an event object;
               it is indexed by key, so a plain list of names won't do
    log     -- logger whose debug() method receives one message per event
    context -- text appended to the thread name in each debug message
    """
    for name in events:
        # wait() returns once the matching thread has set() its event
        event = events[name]
        event.wait()

        message = "thread %s %s" % (name, context)
        log.debug(message)
|
||||
|
||||
def running_threads(threads):
    """ Return how many threads of the given mapping are still alive.

    threads -- mapping of a name to a thread object exposing isAlive()
    """
    alive = [name for name in threads if threads[name].isAlive()]
    return len(alive)
|
||||
|
||||
return
|
||||
|
||||
class RRReader(collections.deque):
|
||||
""" Round Robin reader, which are collections.deque with a
|
||||
|
Loading…
Reference in New Issue
Block a user