mirror of
https://github.com/mozilla-services/syncstorage-rs.git
synced 2025-08-06 20:06:57 +02:00
refactor: remove user migration utils (#1710)
Some checks failed
Glean probe-scraper / glean-probe-scraper (push) Has been cancelled
Some checks failed
Glean probe-scraper / glean-probe-scraper (push) Has been cancelled
This commit is contained in:
parent
1dc421474d
commit
f01c21fef4
@ -4,8 +4,6 @@ See each directory for details:
|
|||||||
|
|
||||||
* [hawk](hawk) - a tool for generating test HAWK authorization headers
|
* [hawk](hawk) - a tool for generating test HAWK authorization headers
|
||||||
* [spanner](spanner) - Google Cloud Platform Spanner tools for maintenance and testing
|
* [spanner](spanner) - Google Cloud Platform Spanner tools for maintenance and testing
|
||||||
* [user_migration](user_migration) - scripts for dumping and moving user data from SQL to Spanner
|
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
These tools are mostly written in python. It is recommended that you create a commonly shared virtual environment using something like:
|
These tools are mostly written in python. It is recommended that you create a commonly shared virtual environment using something like:
|
||||||
|
@ -1,100 +0,0 @@
|
|||||||
# User Migration Script
|
|
||||||
|
|
||||||
This is a workspace for testing user migration from the old databases
|
|
||||||
to the new durable one.
|
|
||||||
|
|
||||||
There are several candidate scripts that you can use.
|
|
||||||
|
|
||||||
These progress off of each other in order to provide cached results.
|
|
||||||
|
|
||||||
There are a few base files you'll want to declare:
|
|
||||||
|
|
||||||
* *dsns* - a file containing the mysql and spanner DSNs for the users.
|
|
||||||
Each DSN should be on a single line. Currently only one DSN of a
|
|
||||||
given type is permitted.
|
|
||||||
|
|
||||||
(e.g.)
|
|
||||||
|
|
||||||
```text
|
|
||||||
mysql://test:test@localhost/syncstorage
|
|
||||||
spanner://projects/sync-spanner-dev-225401/instances/spanner-test/databases/sync_schema3
|
|
||||||
```
|
|
||||||
|
|
||||||
* *users.csv* - a mysql dump of the token database. This file is only needed if the `--deanon` de-anonymization flag is set. By default, data is anononymized to prevent accidental movement.
|
|
||||||
You can produce this file from the following:
|
|
||||||
```bash
|
|
||||||
mysql -e "select uid, email, generation, keys_changed_at, \
|
|
||||||
client_state from users;" > users.csv`
|
|
||||||
```
|
|
||||||
The script will automatically skip the title row, and presumes that fields are tab separated.
|
|
||||||
|
|
||||||
|
|
||||||
With those files you can now run:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
gen_fxa_users.py
|
|
||||||
```
|
|
||||||
which will take the `users.csv` raw data and generate a
|
|
||||||
`fxa_users_{date}.lst` file.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
gen_bso_users.py --bso_num #
|
|
||||||
```
|
|
||||||
which will automatically read in the `fxa_users_{date}.lst` file,
|
|
||||||
connect to the mysql database, and geneate a list of sorted users
|
|
||||||
taken from the `bso#` table. This will create the
|
|
||||||
`bso_users_{bso_num}_{date}.lst` file
|
|
||||||
|
|
||||||
and finally:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
GOOGLE_APPLICATION_CREDENTIALS=credentials.json migrate_node.py \
|
|
||||||
[--start_bso=0] \
|
|
||||||
[--end_bso=19] \
|
|
||||||
[--user_percent 1:100]
|
|
||||||
```
|
|
||||||
|
|
||||||
Which will read the `bso_users_#_{date}.lst` files and move the users
|
|
||||||
based on `--user_percent`
|
|
||||||
|
|
||||||
More importantly `--help` is your friend. feel free to use liberally.
|
|
||||||
|
|
||||||
## installation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
virtualenv venv && venv/bin/pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
## running
|
|
||||||
|
|
||||||
Since you will be connecting to the GCP Spanner API, you will need to have set the `GOOGLE_APPLICATION_CREDENTIALS` env var before running these scripts. This environment variable should point to the exported Google Credentials acquired from the GCP console.
|
|
||||||
|
|
||||||
The scripts will take the following actions:
|
|
||||||
|
|
||||||
1. fetch all users from a given node.
|
|
||||||
1. compare and port all user_collections over (NOTE: this may involve remapping collecitonid values.)
|
|
||||||
1. begin copying over user information from mysql to spanner.
|
|
||||||
|
|
||||||
Overall performance may be improved by "batching" BSOs to different
|
|
||||||
processes using:
|
|
||||||
|
|
||||||
`--start_bso` the BSO database (defaults to 0, inclusive) to begin
|
|
||||||
copying from
|
|
||||||
|
|
||||||
`--end_bso` the final BSO database (defaults to 19, inclusive) to copy
|
|
||||||
from.
|
|
||||||
|
|
||||||
Note that these are inclusive values. So to split between two
|
|
||||||
processes, you would want to use
|
|
||||||
|
|
||||||
```bash
|
|
||||||
migrate_node.py --start_bso=0 --end_bso=9 &
|
|
||||||
migrate_node.py --start_bso=10 --end_bso=19 &
|
|
||||||
```
|
|
||||||
|
|
||||||
(As short hand for this case, you could also do:
|
|
||||||
```
|
|
||||||
migrate_node.py --end_bso=9 &
|
|
||||||
migrate_node.py --start_bso=10 &
|
|
||||||
```
|
|
||||||
and let the defaults handle the rest.)
|
|
@ -1,15 +0,0 @@
|
|||||||
INSERT IGNORE INTO weave0.collections (name, collectionid) VALUES
|
|
||||||
("clients", 1),
|
|
||||||
("crypto", 2),
|
|
||||||
("forms", 3),
|
|
||||||
("history", 4),
|
|
||||||
("keys", 5),
|
|
||||||
("meta", 6),
|
|
||||||
("bookmarks", 7),
|
|
||||||
("prefs", 8),
|
|
||||||
("tabs", 9),
|
|
||||||
("passwords", 10),
|
|
||||||
("addons", 11),
|
|
||||||
("addresses", 12),
|
|
||||||
("creditcards", 13),
|
|
||||||
("reserved", 99);
|
|
@ -1,290 +0,0 @@
|
|||||||
#! venv/bin/python
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
import csv
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from mysql import connector
|
|
||||||
try:
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
except ImportError:
|
|
||||||
from urlparse import urlparse
|
|
||||||
|
|
||||||
|
|
||||||
def tick(count):
|
|
||||||
mark = None
|
|
||||||
if count % 1000 == 0:
|
|
||||||
mark = "|"
|
|
||||||
elif count % 100 == 0:
|
|
||||||
mark = "."
|
|
||||||
level = logging.getLogger().getEffectiveLevel()
|
|
||||||
if mark and level > logging.DEBUG:
|
|
||||||
print(mark, end='', flush=True)
|
|
||||||
|
|
||||||
|
|
||||||
class Report:
|
|
||||||
|
|
||||||
bso = "init"
|
|
||||||
_failure = None
|
|
||||||
_success = None
|
|
||||||
|
|
||||||
def __init__(self, args, lock=None):
|
|
||||||
self._success_file = args.success_file
|
|
||||||
self._failure_file = args.failure_file
|
|
||||||
self._lock = lock
|
|
||||||
|
|
||||||
def success(self, uid):
|
|
||||||
if self._lock:
|
|
||||||
lock = self._lock.acquire()
|
|
||||||
if not self._success:
|
|
||||||
self._success = open(self._success_file, "w")
|
|
||||||
self._success.write("{}\t{}\n".format(self.bso, uid))
|
|
||||||
|
|
||||||
def fail(self, uid, reason=None):
|
|
||||||
if self._lock:
|
|
||||||
lock = self._lock.acquire()
|
|
||||||
if not self._failure:
|
|
||||||
self._failure = open(self._failure_file, "w")
|
|
||||||
logging.debug("Skipping user {}".format(uid))
|
|
||||||
self._failure.write("{}\t{}\t{}\n".format(self.bso, uid, reason or ""))
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self._success.close()
|
|
||||||
self._failure.close()
|
|
||||||
|
|
||||||
|
|
||||||
class BSO_Users:
|
|
||||||
"""User information from Tokenserver database.
|
|
||||||
|
|
||||||
Can be constructed from
|
|
||||||
``mysql -e "select uid, email, generation, keys_changed_at, \
|
|
||||||
client_state from users;" > users.csv`
|
|
||||||
"""
|
|
||||||
users = {}
|
|
||||||
anon = False
|
|
||||||
|
|
||||||
def __init__(self, args, report, dsn):
|
|
||||||
self.args = args
|
|
||||||
self.dsn = dsn
|
|
||||||
self.report = report
|
|
||||||
self.get_users(args)
|
|
||||||
|
|
||||||
def get_users(self, args):
|
|
||||||
try:
|
|
||||||
logging.info("Reading fxa_user data.")
|
|
||||||
with open(args.fxa_users_file) as csv_file:
|
|
||||||
line = 0
|
|
||||||
for (uid, fxa_uid, fxa_kid) in csv.reader(
|
|
||||||
csv_file, delimiter="\t"
|
|
||||||
):
|
|
||||||
if uid == "uid":
|
|
||||||
continue
|
|
||||||
tick(line)
|
|
||||||
logging.debug("Read: {} {}:{}".format(
|
|
||||||
uid, fxa_uid, fxa_kid))
|
|
||||||
self.users[int(uid)] = (fxa_uid, fxa_kid)
|
|
||||||
line += 1
|
|
||||||
print("")
|
|
||||||
except Exception as ex:
|
|
||||||
logging.error(
|
|
||||||
"Unexpected error",
|
|
||||||
exc_info=ex
|
|
||||||
)
|
|
||||||
self.report.fail(uid, "Unexpected error {}".format(ex))
|
|
||||||
|
|
||||||
def run(self, bso_num):
|
|
||||||
connection = self.conf_mysql(self.dsn)
|
|
||||||
out_users = []
|
|
||||||
bso_file = self.args.output_file
|
|
||||||
bso_file = bso_file.replace("#", str(bso_num))
|
|
||||||
logging.info("Fetching users from BSO db into {}".format(
|
|
||||||
bso_file,
|
|
||||||
))
|
|
||||||
output_file = open(bso_file, "w")
|
|
||||||
try:
|
|
||||||
cursor = connection.cursor()
|
|
||||||
sql = ("""select userid, count(*) as count from bso{}"""
|
|
||||||
""" group by userid order by userid""".format(
|
|
||||||
bso_num))
|
|
||||||
if self.args.user_range:
|
|
||||||
(offset, limit) = self.args.user_range.split(':')
|
|
||||||
sql = "{} limit {} offset {}".format(
|
|
||||||
sql, limit, offset)
|
|
||||||
cursor.execute(sql)
|
|
||||||
for (uid, count) in cursor:
|
|
||||||
try:
|
|
||||||
(fxa_uid, fxa_kid) = self.users.get(uid)
|
|
||||||
if self.args.hoard_limit and count > self.args.hoard_limit:
|
|
||||||
logging.warn(
|
|
||||||
"User {} => {}:{} has too "
|
|
||||||
"many items: {} ".format(
|
|
||||||
uid, fxa_uid, fxa_kid, count
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.report.fail(uid, "hoarder {}".format(count))
|
|
||||||
continue
|
|
||||||
out_users.append((uid, fxa_uid, fxa_kid))
|
|
||||||
except TypeError:
|
|
||||||
self.report.fail(uid, "not found")
|
|
||||||
logging.error(
|
|
||||||
("User {} not found in "
|
|
||||||
"tokenserver data".format(uid)))
|
|
||||||
if self.args.sort_users:
|
|
||||||
logging.info("Sorting users...")
|
|
||||||
out_users.sort(key=lambda tup: tup[1])
|
|
||||||
# Take a block of percentage of the users.
|
|
||||||
logging.info("Writing out {} users".format(len(out_users)))
|
|
||||||
line = 0
|
|
||||||
output_file.write("uid\tfxa_uid\tfxa_kid\n")
|
|
||||||
for (uid, fxa_uid, fxa_kid) in out_users:
|
|
||||||
output_file.write("{}\t{}\t{}\n".format(
|
|
||||||
uid, fxa_uid, fxa_kid
|
|
||||||
))
|
|
||||||
tick(line)
|
|
||||||
line += 1
|
|
||||||
output_file.flush()
|
|
||||||
print("")
|
|
||||||
except connector.errors.ProgrammingError as ex:
|
|
||||||
logging.error(ex)
|
|
||||||
output_file.close()
|
|
||||||
os.unlink(bso_file)
|
|
||||||
except Exception as e:
|
|
||||||
logging.error("### Exception {}:{}", exc_info=e)
|
|
||||||
output_file.close()
|
|
||||||
os.unlink(bso_file)
|
|
||||||
finally:
|
|
||||||
cursor.close()
|
|
||||||
|
|
||||||
def conf_mysql(self, dsn):
|
|
||||||
"""create a connection to the original storage system """
|
|
||||||
logging.debug("Configuring MYSQL: {}".format(dsn))
|
|
||||||
return connector.connect(
|
|
||||||
user=dsn.username,
|
|
||||||
password=dsn.password,
|
|
||||||
host=dsn.hostname,
|
|
||||||
port=dsn.port or 3306,
|
|
||||||
database=dsn.path[1:]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
|
||||||
pid = os.getpid()
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Generate BSO user list")
|
|
||||||
parser.add_argument(
|
|
||||||
'--dsns', default="move_dsns.lst",
|
|
||||||
help="file of new line separated DSNs")
|
|
||||||
parser.add_argument(
|
|
||||||
'--start_bso',
|
|
||||||
default=0,
|
|
||||||
help="Start of BSO range (default 0)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--end_bso',
|
|
||||||
default=19,
|
|
||||||
help="End of BSO range inclusive (default 19)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--bso_num',
|
|
||||||
type=int,
|
|
||||||
default=0,
|
|
||||||
help="Only read from this bso (default num)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--output_file',
|
|
||||||
default="bso_users_#_{}.lst".format(
|
|
||||||
datetime.now().strftime("%Y_%m_%d")),
|
|
||||||
help="List of BSO users."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--verbose',
|
|
||||||
action="store_true",
|
|
||||||
help="verbose logging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--quiet',
|
|
||||||
action="store_true",
|
|
||||||
help="silence logging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--user_range',
|
|
||||||
help="Range of users to extract (offset:limit)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--hoard_limit', type=int, default=0,
|
|
||||||
help="reject any user with more than this count of records"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--sort_users', action="store_true",
|
|
||||||
help="Sort the user"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--success_file', default="success_bso_user.log".format(pid),
|
|
||||||
help="File of successfully migrated userids"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--failure_file', default="failure_bso_user.log".format(pid),
|
|
||||||
help="File of unsuccessfully migrated userids"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--fxa_users_file',
|
|
||||||
default="fxa_users_{}.lst".format(datetime.now().strftime("%Y_%m_%d")),
|
|
||||||
help="List of pre-generated FxA users."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--threading',
|
|
||||||
action="store_true",
|
|
||||||
help="use threading"
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
threads = []
|
|
||||||
args = get_args()
|
|
||||||
log_level = logging.INFO
|
|
||||||
if args.quiet:
|
|
||||||
log_level = logging.ERROR
|
|
||||||
if args.verbose:
|
|
||||||
log_level = logging.DEBUG
|
|
||||||
logging.basicConfig(
|
|
||||||
stream=sys.stdout,
|
|
||||||
level=log_level,
|
|
||||||
)
|
|
||||||
if args.bso_num is not None:
|
|
||||||
args.start_bso = args.end_bso = args.bso_num
|
|
||||||
locker = None
|
|
||||||
if args.threading:
|
|
||||||
locker = threading.Lock()
|
|
||||||
report = Report(args, locker)
|
|
||||||
dsns = open(args.dsns).readlines()
|
|
||||||
db_dsn = None
|
|
||||||
for line in dsns:
|
|
||||||
dsn = urlparse(line.strip())
|
|
||||||
if 'mysql' in dsn.scheme:
|
|
||||||
db_dsn = dsn
|
|
||||||
|
|
||||||
if not db_dsn:
|
|
||||||
RuntimeError("mysql dsn must be specified")
|
|
||||||
|
|
||||||
bso = BSO_Users(args, report, db_dsn)
|
|
||||||
# threading is currently in process.
|
|
||||||
if args.threading:
|
|
||||||
for bso_num in range(int(args.start_bso), int(args.end_bso) + 1):
|
|
||||||
t = threading.Thread(target=bso.run, args=(bso_num,))
|
|
||||||
threads.append(t)
|
|
||||||
t.start()
|
|
||||||
else:
|
|
||||||
bso.run(args.bso_num)
|
|
||||||
|
|
||||||
for thread in threads:
|
|
||||||
thread.join()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
@ -1,203 +0,0 @@
|
|||||||
#! venv/bin/python
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import base64
|
|
||||||
import binascii
|
|
||||||
import csv
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
def tick(count):
|
|
||||||
mark = None
|
|
||||||
if count % 1000 == 0:
|
|
||||||
mark = "|"
|
|
||||||
elif count % 100 == 0:
|
|
||||||
mark = "."
|
|
||||||
level = logging.getLogger().getEffectiveLevel()
|
|
||||||
if mark and level > logging.DEBUG:
|
|
||||||
print(mark, end='', flush=True)
|
|
||||||
|
|
||||||
|
|
||||||
class Report:
|
|
||||||
|
|
||||||
bso = "init"
|
|
||||||
_success = None
|
|
||||||
_failure = None
|
|
||||||
|
|
||||||
def __init__(self, args):
|
|
||||||
self._success_file = args.success_file
|
|
||||||
self._failure_file = args.failure_file
|
|
||||||
|
|
||||||
def success(self, uid):
|
|
||||||
if not self._success:
|
|
||||||
self._success = open(self._success_file, "w")
|
|
||||||
self._success.write("{}\t{}\n".format(self.bso, uid))
|
|
||||||
|
|
||||||
def fail(self, uid, reason=None):
|
|
||||||
if not self._failure:
|
|
||||||
self._failure = open(self._failure_file, "w")
|
|
||||||
logging.debug("Skipping user {}".format(uid))
|
|
||||||
self._failure.write("{}\t{}\t{}\n".format(self.bso, uid, reason or ""))
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self._success.close()
|
|
||||||
self._failure.close()
|
|
||||||
|
|
||||||
|
|
||||||
class FxA_Generate:
|
|
||||||
"""User information from Tokenserver database.
|
|
||||||
|
|
||||||
Can be constructed from
|
|
||||||
``mysql -e "select uid, email, generation, keys_changed_at, \
|
|
||||||
client_state from users;" > users.csv`
|
|
||||||
"""
|
|
||||||
users = []
|
|
||||||
anon = False
|
|
||||||
|
|
||||||
def __init__(self, args, report):
|
|
||||||
logging.info("Processing token file: {} into {}".format(
|
|
||||||
args.users_file,
|
|
||||||
args.output_file,
|
|
||||||
))
|
|
||||||
output_file = open(args.output_file, "w")
|
|
||||||
output_file.write("uid\tfxa_uid\tfxa_kid\n")
|
|
||||||
if not os.path.isfile(args.users_file):
|
|
||||||
raise IOError("{} not found".format(args.users_file))
|
|
||||||
with open(args.users_file) as csv_file:
|
|
||||||
try:
|
|
||||||
line = 0
|
|
||||||
success = 0
|
|
||||||
for (uid, email, generation,
|
|
||||||
keys_changed_at, client_state) in csv.reader(
|
|
||||||
csv_file, delimiter="\t"):
|
|
||||||
line += 1
|
|
||||||
if uid == 'uid':
|
|
||||||
# skip the header row.
|
|
||||||
continue
|
|
||||||
tick(line)
|
|
||||||
try:
|
|
||||||
fxa_uid = email.split('@')[0]
|
|
||||||
try:
|
|
||||||
keys_changed_at = int(keys_changed_at)
|
|
||||||
except ValueError:
|
|
||||||
keys_changed_at = 0
|
|
||||||
|
|
||||||
try:
|
|
||||||
generation = int(generation)
|
|
||||||
except ValueError:
|
|
||||||
generation = 0
|
|
||||||
|
|
||||||
if (keys_changed_at or generation) == 0:
|
|
||||||
logging.warn(
|
|
||||||
"user {} has no k_c_a or "
|
|
||||||
"generation value".format(
|
|
||||||
uid))
|
|
||||||
# trap for actually blank values
|
|
||||||
if client_state is None or client_state == '':
|
|
||||||
logging.error(
|
|
||||||
"User {} "
|
|
||||||
"has an invalid, empty client state".format(
|
|
||||||
uid
|
|
||||||
)
|
|
||||||
)
|
|
||||||
report.fail(uid, "invalid client state")
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
client_state = binascii.unhexlify(client_state)
|
|
||||||
except binascii.Error:
|
|
||||||
logging.error(
|
|
||||||
"User {} has "
|
|
||||||
"invalid client state: {}".format(
|
|
||||||
uid, client_state
|
|
||||||
))
|
|
||||||
report.fail(uid, "bad client state")
|
|
||||||
continue
|
|
||||||
fxa_kid = self.format_key_id(
|
|
||||||
int(keys_changed_at or generation),
|
|
||||||
client_state
|
|
||||||
)
|
|
||||||
logging.debug("Adding user {} => {} , {}".format(
|
|
||||||
uid, fxa_uid, fxa_kid
|
|
||||||
))
|
|
||||||
output_file.write(
|
|
||||||
"{}\t{}\t{}\n".format(
|
|
||||||
uid, fxa_uid, fxa_kid))
|
|
||||||
success += 1
|
|
||||||
except Exception as ex:
|
|
||||||
logging.error(
|
|
||||||
"User {} Unexpected error".format(uid),
|
|
||||||
exc_info=ex)
|
|
||||||
report.fail(uid, "unexpected error")
|
|
||||||
except Exception as ex:
|
|
||||||
logging.critical("Error in fxa file around line {}".format(
|
|
||||||
line), exc_info=ex)
|
|
||||||
print("")
|
|
||||||
logging.info("Processed {} users, {} successful".format(line, success))
|
|
||||||
|
|
||||||
# The following two functions are taken from browserid.utils
|
|
||||||
def encode_bytes_b64(self, value):
|
|
||||||
return base64.urlsafe_b64encode(value).rstrip(b'=').decode('ascii')
|
|
||||||
|
|
||||||
def format_key_id(self, keys_changed_at, key_hash):
|
|
||||||
return "{:013d}-{}".format(
|
|
||||||
keys_changed_at,
|
|
||||||
self.encode_bytes_b64(key_hash),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
|
||||||
pid = os.getpid()
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Generate FxA user id info")
|
|
||||||
parser.add_argument(
|
|
||||||
'--users_file',
|
|
||||||
default="users.csv",
|
|
||||||
help="FXA User info in CSV format (default users.csv)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--output_file',
|
|
||||||
default="fxa_users_{}.lst".format(datetime.now().strftime("%Y_%m_%d")),
|
|
||||||
help="List of FxA users."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--verbose',
|
|
||||||
action="store_true",
|
|
||||||
help="verbose logging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--quiet',
|
|
||||||
action="store_true",
|
|
||||||
help="silence logging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--success_file', default="success_fxa_user.log".format(pid),
|
|
||||||
help="File of successfully migrated userids"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--failure_file', default="failure_fxa_user.log".format(pid),
|
|
||||||
help="File of unsuccessfully migrated userids"
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = get_args()
|
|
||||||
log_level = logging.INFO
|
|
||||||
if args.quiet:
|
|
||||||
log_level = logging.ERROR
|
|
||||||
if args.verbose:
|
|
||||||
log_level = logging.DEBUG
|
|
||||||
logging.basicConfig(
|
|
||||||
stream=sys.stdout,
|
|
||||||
level=log_level,
|
|
||||||
)
|
|
||||||
report = Report(args)
|
|
||||||
FxA_Generate(args, report)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
@ -1,827 +0,0 @@
|
|||||||
#! venv/bin/python
|
|
||||||
|
|
||||||
# painfully stupid script to check out dumping mysql databases to avro.
|
|
||||||
# Avro is basically "JSON" for databases. It's not super complicated & it has
|
|
||||||
# issues (one of which is that it requires Python2).
|
|
||||||
#
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import base64
|
|
||||||
import binascii
|
|
||||||
import csv
|
|
||||||
import sys
|
|
||||||
import math
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
import grpc
|
|
||||||
from mysql import connector
|
|
||||||
from google.cloud import spanner
|
|
||||||
from google.cloud.spanner_v1 import param_types
|
|
||||||
from google.api_core.exceptions import AlreadyExists, InvalidArgument
|
|
||||||
try:
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
except ImportError:
|
|
||||||
from urlparse import urlparse
|
|
||||||
|
|
||||||
META_GLOBAL_COLLECTION_NAME = "meta"
|
|
||||||
MAX_ROWS = 1500000
|
|
||||||
|
|
||||||
|
|
||||||
class BadDSNException(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def tick(count):
|
|
||||||
mark = None
|
|
||||||
if not count % 100:
|
|
||||||
mark = "."
|
|
||||||
if not count % 1000:
|
|
||||||
mark = "|"
|
|
||||||
level = logging.getLogger().getEffectiveLevel()
|
|
||||||
if mark and level > logging.DEBUG:
|
|
||||||
print(mark, end='', flush=True)
|
|
||||||
|
|
||||||
|
|
||||||
class Report:
|
|
||||||
|
|
||||||
bso = "init"
|
|
||||||
_success = None
|
|
||||||
_failure = None
|
|
||||||
|
|
||||||
def __init__(self, args):
|
|
||||||
self._success_file = args.success_file
|
|
||||||
self._failure_file = args.failure_file
|
|
||||||
|
|
||||||
def success(self, uid):
|
|
||||||
if not self._success:
|
|
||||||
self._success = open(self._success_file, "w")
|
|
||||||
self._success.write("{}\t{}\n".format(self.bso, uid))
|
|
||||||
|
|
||||||
def fail(self, uid, reason=None):
|
|
||||||
if not self._failure:
|
|
||||||
self._failure = open(self._failure_file, "w")
|
|
||||||
logging.debug("Skipping user {}".format(uid))
|
|
||||||
self._failure.write("{}\t{}\t{}\n".format(self.bso, uid, reason or ""))
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self._success.close()
|
|
||||||
self._failure.close()
|
|
||||||
|
|
||||||
def read_failure(self, input):
|
|
||||||
start = 19
|
|
||||||
end = 0
|
|
||||||
users = []
|
|
||||||
for line in open(input).readlines():
|
|
||||||
line = line.strip()
|
|
||||||
if line[0] in "#;/":
|
|
||||||
continue
|
|
||||||
(bso, user, reason) = line.split("\t")
|
|
||||||
start = min(start, int(bso))
|
|
||||||
end = max(end, int(bso))
|
|
||||||
users.append(user)
|
|
||||||
return (int(start), int(end), users)
|
|
||||||
|
|
||||||
class FXA_info:
|
|
||||||
"""User information from Tokenserver database.
|
|
||||||
|
|
||||||
Can be constructed from
|
|
||||||
``mysql -e "select uid, email, generation, keys_changed_at, \
|
|
||||||
client_state from users;" > users.csv`
|
|
||||||
"""
|
|
||||||
users = {}
|
|
||||||
anon = False
|
|
||||||
|
|
||||||
def __init__(self, users_file, args, report):
|
|
||||||
if args.anon:
|
|
||||||
self.anon = True
|
|
||||||
return
|
|
||||||
logging.info("Reading users file: {}".format(users_file))
|
|
||||||
if not os.path.isfile(users_file):
|
|
||||||
raise IOError("{} not found".format(users_file))
|
|
||||||
with open(users_file) as csv_file:
|
|
||||||
try:
|
|
||||||
line = 0
|
|
||||||
for (uid, fxa_uid, fxa_kid) in csv.reader(
|
|
||||||
csv_file, delimiter="\t"):
|
|
||||||
line += 1
|
|
||||||
tick(line)
|
|
||||||
if uid == 'uid':
|
|
||||||
# skip the header row.
|
|
||||||
continue
|
|
||||||
if args.user:
|
|
||||||
if int(uid) not in args.user:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
self.users[int(uid)] = (fxa_kid, fxa_uid)
|
|
||||||
except Exception as ex:
|
|
||||||
logging.error(
|
|
||||||
"User {} Unexpected error".format(uid),
|
|
||||||
exc_info=ex)
|
|
||||||
report.fail(uid, "unexpected error")
|
|
||||||
except Exception as ex:
|
|
||||||
logging.critical("Error in fxa file around line {}".format(
|
|
||||||
line), exc_info=ex)
|
|
||||||
|
|
||||||
def get(self, userid):
|
|
||||||
if userid in self.users:
|
|
||||||
return self.users[userid]
|
|
||||||
if self.anon:
|
|
||||||
fxa_uid = "fake_" + binascii.hexlify(
|
|
||||||
os.urandom(11)).decode('utf-8')
|
|
||||||
fxa_kid = "fake_" + binascii.hexlify(
|
|
||||||
os.urandom(11)).decode('utf-8')
|
|
||||||
self.users[userid] = (fxa_kid, fxa_uid)
|
|
||||||
return (fxa_kid, fxa_uid)
|
|
||||||
|
|
||||||
|
|
||||||
class Collections:
|
|
||||||
"""Cache spanner collection list.
|
|
||||||
|
|
||||||
The spanner collection list is the (soon to be) single source of
|
|
||||||
truth regarding collection ids.
|
|
||||||
|
|
||||||
"""
|
|
||||||
_by_name = {
|
|
||||||
"clients": 1,
|
|
||||||
"crypto": 2,
|
|
||||||
"forms": 3,
|
|
||||||
"history": 4,
|
|
||||||
"keys": 5,
|
|
||||||
"meta": 6,
|
|
||||||
"bookmarks": 7,
|
|
||||||
"prefs": 8,
|
|
||||||
"tabs": 9,
|
|
||||||
"passwords": 10,
|
|
||||||
"addons": 11,
|
|
||||||
"addresses": 12,
|
|
||||||
"creditcards": 13,
|
|
||||||
"reserved": 100,
|
|
||||||
}
|
|
||||||
spanner = None
|
|
||||||
|
|
||||||
def __init__(self, databases):
|
|
||||||
"""merge the mysql user_collections into spanner"""
|
|
||||||
sql = """
|
|
||||||
SELECT
|
|
||||||
DISTINCT uc.collection, cc.name
|
|
||||||
FROM
|
|
||||||
user_collections as uc,
|
|
||||||
collections as cc
|
|
||||||
WHERE
|
|
||||||
uc.collection = cc.collectionid
|
|
||||||
ORDER BY
|
|
||||||
uc.collection
|
|
||||||
"""
|
|
||||||
cursor = databases['mysql'].cursor()
|
|
||||||
|
|
||||||
def transact(transaction, values):
|
|
||||||
transaction.insert(
|
|
||||||
'collections',
|
|
||||||
columns=('collection_id', 'name'),
|
|
||||||
values=values)
|
|
||||||
|
|
||||||
self.spanner = databases['spanner']
|
|
||||||
try:
|
|
||||||
# fetch existing:
|
|
||||||
with self.spanner.snapshot() as scursor:
|
|
||||||
rows = scursor.execute_sql(
|
|
||||||
"select collection_id, name from collections")
|
|
||||||
for (collection_id, name) in rows:
|
|
||||||
logging.debug("Loading collection: {} => {}".format(
|
|
||||||
name, collection_id
|
|
||||||
))
|
|
||||||
self._by_name[name] = collection_id
|
|
||||||
cursor.execute(sql)
|
|
||||||
for (collection_id, name) in cursor:
|
|
||||||
if name not in self._by_name:
|
|
||||||
logging.debug("Adding collection: {} => {}".format(
|
|
||||||
name, collection_id
|
|
||||||
))
|
|
||||||
values = [(collection_id, name)]
|
|
||||||
self._by_name[name] = collection_id
|
|
||||||
# Since a collection may collide, do these one at a time.
|
|
||||||
try:
|
|
||||||
self.spanner.run_in_transaction(transact, values)
|
|
||||||
except AlreadyExists:
|
|
||||||
logging.info(
|
|
||||||
"Skipping already present collection {}".format(
|
|
||||||
values
|
|
||||||
))
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
cursor.close()
|
|
||||||
|
|
||||||
def get(self, name, collection_id=None):
|
|
||||||
"""Fetches the collection_id"""
|
|
||||||
|
|
||||||
id = self._by_name.get(name)
|
|
||||||
if id is None:
|
|
||||||
logging.warn(
|
|
||||||
"Unknown collection {}:{} encountered!".format(
|
|
||||||
name, collection_id))
|
|
||||||
# it would be swell to add these to the collection table,
|
|
||||||
# but that would mean
|
|
||||||
# an imbedded spanner transaction, and that's not allowed.
|
|
||||||
return None
|
|
||||||
return id
|
|
||||||
|
|
||||||
|
|
||||||
def conf_mysql(dsn):
|
|
||||||
"""create a connection to the original storage system """
|
|
||||||
logging.debug("Configuring MYSQL: {}".format(dsn))
|
|
||||||
connection = connector.connect(
|
|
||||||
user=dsn.username,
|
|
||||||
password=dsn.password,
|
|
||||||
host=dsn.hostname,
|
|
||||||
port=dsn.port or 3306,
|
|
||||||
database=dsn.path[1:],
|
|
||||||
auth_plugin="mysql_native_password"
|
|
||||||
)
|
|
||||||
return connection
|
|
||||||
|
|
||||||
|
|
||||||
def conf_spanner(dsn):
|
|
||||||
"""create a connection to the new Spanner system"""
|
|
||||||
logging.debug("Configuring SPANNER: {}".format(dsn))
|
|
||||||
path = dsn.path.split("/")
|
|
||||||
instance_id = path[-3]
|
|
||||||
database_id = path[-1]
|
|
||||||
client = spanner.Client()
|
|
||||||
instance = client.instance(instance_id)
|
|
||||||
database = instance.database(database_id)
|
|
||||||
return database
|
|
||||||
|
|
||||||
|
|
||||||
def conf_db(dsn):
|
|
||||||
"""read the list of storage definitions from the file and create
|
|
||||||
a set of connetions.
|
|
||||||
|
|
||||||
"""
|
|
||||||
if "mysql" in dsn.scheme:
|
|
||||||
return conf_mysql(dsn)
|
|
||||||
if "spanner" in dsn.scheme:
|
|
||||||
return conf_spanner(dsn)
|
|
||||||
raise RuntimeError("Unknown DSN type: {}".format(dsn.scheme))
|
|
||||||
|
|
||||||
|
|
||||||
def dumper(columns, values):
|
|
||||||
"""verbose column and data dumper. """
|
|
||||||
result = ""
|
|
||||||
for row in values:
|
|
||||||
for i in range(0, len(columns)):
|
|
||||||
result += " {} => {}\n".format(columns[i], row[i])
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def newSyncID():
|
|
||||||
base64.urlsafe_b64encode(os.urandom(9))
|
|
||||||
|
|
||||||
|
|
||||||
def alter_syncids(pay):
|
|
||||||
"""Alter the syncIDs for the meta/global record, which will cause a sync
|
|
||||||
when the client reconnects
|
|
||||||
|
|
||||||
"""
|
|
||||||
payload = json.loads(pay)
|
|
||||||
payload['syncID'] = newSyncID()
|
|
||||||
for item in payload['engines']:
|
|
||||||
payload['engines'][item]['syncID'] = newSyncID()
|
|
||||||
return json.dumps(payload)
|
|
||||||
|
|
||||||
|
|
||||||
def divvy(biglist, count):
|
|
||||||
"""Partition a list into a set of equally sized slices"""
|
|
||||||
lists = []
|
|
||||||
biglen = len(biglist)
|
|
||||||
start = 0
|
|
||||||
while start < biglen:
|
|
||||||
lists.append(biglist[start:min(start+count, biglen)])
|
|
||||||
start += count
|
|
||||||
return lists
|
|
||||||
|
|
||||||
|
|
||||||
def move_user(databases, user_data, collections, fxa, bso_num, args, report):
|
|
||||||
"""copy user info from original storage to new storage."""
|
|
||||||
# bso column mapping:
|
|
||||||
# id => bso_id
|
|
||||||
# collection => collection_id
|
|
||||||
# sortindex => sortindex
|
|
||||||
# modified => modified
|
|
||||||
# payload => payload
|
|
||||||
# payload_size => NONE
|
|
||||||
# ttl => expiry
|
|
||||||
|
|
||||||
uc_columns = (
|
|
||||||
'fxa_uid',
|
|
||||||
'fxa_kid',
|
|
||||||
'collection_id',
|
|
||||||
'modified',
|
|
||||||
)
|
|
||||||
bso_columns = (
|
|
||||||
'collection_id',
|
|
||||||
'fxa_uid',
|
|
||||||
'fxa_kid',
|
|
||||||
'bso_id',
|
|
||||||
'expiry',
|
|
||||||
'modified',
|
|
||||||
'payload',
|
|
||||||
'sortindex',
|
|
||||||
)
|
|
||||||
|
|
||||||
(uid, fxa_uid, fxa_kid) = user_data
|
|
||||||
# Fetch the BSO data from the original storage.
|
|
||||||
sql = """
|
|
||||||
SELECT
|
|
||||||
collections.name, bso.collection, uc.last_modified,
|
|
||||||
bso.id, bso.ttl, bso.modified, bso.payload, bso.sortindex
|
|
||||||
FROM
|
|
||||||
bso{} as bso,
|
|
||||||
collections,
|
|
||||||
user_collections as uc
|
|
||||||
WHERE
|
|
||||||
bso.userid = %s
|
|
||||||
and collections.collectionid = bso.collection
|
|
||||||
and uc.collection = bso.collection
|
|
||||||
and uc.userid = bso.userid
|
|
||||||
and bso.ttl > unix_timestamp()
|
|
||||||
ORDER BY
|
|
||||||
bso.collection, bso.id""".format(bso_num)
|
|
||||||
unique_key_filter = set()
|
|
||||||
|
|
||||||
def spanner_transact_wipe_user(
|
|
||||||
transaction, fxa_uid, fxa_kid, args):
|
|
||||||
result = transaction.execute_sql(
|
|
||||||
"""
|
|
||||||
SELECT
|
|
||||||
uc.collection_id, c.name
|
|
||||||
FROM
|
|
||||||
user_collections as uc
|
|
||||||
LEFT JOIN
|
|
||||||
collections as c
|
|
||||||
ON
|
|
||||||
uc.collection_id = c.collection_id
|
|
||||||
WHERE
|
|
||||||
uc.fxa_uid = @fxa_uid
|
|
||||||
AND uc.fxa_kid = @fxa_kid
|
|
||||||
""",
|
|
||||||
params=dict(fxa_uid=fxa_uid, fxa_kid=fxa_kid),
|
|
||||||
param_types=dict(fxa_uid=param_types.STRING, fxa_kid=param_types.STRING),
|
|
||||||
)
|
|
||||||
cols = [(row[0], row[1]) for row in result]
|
|
||||||
if not args.dryrun:
|
|
||||||
logging.debug("Wiping user, collections: {}".format(cols))
|
|
||||||
transaction.execute_update(
|
|
||||||
"""
|
|
||||||
DELETE FROM
|
|
||||||
user_collections
|
|
||||||
WHERE
|
|
||||||
fxa_uid = @fxa_uid
|
|
||||||
AND fxa_kid = @fxa_kid
|
|
||||||
""",
|
|
||||||
params=dict(fxa_uid=fxa_uid, fxa_kid=fxa_kid),
|
|
||||||
param_types=dict(fxa_uid=param_types.STRING, fxa_kid=param_types.STRING),
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logging.debug("Not wiping user, collections: {}".format(cols))
|
|
||||||
|
|
||||||
def spanner_transact_uc(
|
|
||||||
transaction, data, fxa_uid, fxa_kid, args):
|
|
||||||
# user collections require a unique key.
|
|
||||||
for (col, cid, cmod, bid, exp, bmod, pay, sid) in data:
|
|
||||||
collection_id = collections.get(col, cid)
|
|
||||||
if collection_id is None:
|
|
||||||
continue
|
|
||||||
# columns from sync_schema3
|
|
||||||
# user_collections modified should come directly from
|
|
||||||
# mysql user_collections.last_modified
|
|
||||||
mod_v = datetime.utcfromtimestamp(cmod/1000.0)
|
|
||||||
# User_Collection can only have unique values. Filter
|
|
||||||
# non-unique keys and take the most recent modified
|
|
||||||
# time. The join could be anything.
|
|
||||||
uc_key = "{}_{}_{}".format(fxa_uid, fxa_kid, col)
|
|
||||||
if uc_key not in unique_key_filter:
|
|
||||||
uc_values = [(
|
|
||||||
fxa_uid,
|
|
||||||
fxa_kid,
|
|
||||||
collection_id,
|
|
||||||
mod_v,
|
|
||||||
)]
|
|
||||||
if not args.dryrun:
|
|
||||||
transaction.insert(
|
|
||||||
'user_collections',
|
|
||||||
columns=uc_columns,
|
|
||||||
values=uc_values
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logging.debug("not writing {} => {}".format(
|
|
||||||
uc_columns, uc_values))
|
|
||||||
unique_key_filter.add(uc_key)
|
|
||||||
|
|
||||||
def spanner_transact_bso(transaction, data, fxa_uid, fxa_kid, args):
|
|
||||||
count = 0
|
|
||||||
bso_values = []
|
|
||||||
for (col, cid, cmod, bid, exp, bmod, pay, sid) in data:
|
|
||||||
collection_id = collections.get(col, cid)
|
|
||||||
if collection_id is None:
|
|
||||||
continue
|
|
||||||
if collection_id != cid:
|
|
||||||
logging.debug(
|
|
||||||
"Remapping collection '{}' from {} to {}".format(
|
|
||||||
col, cid, collection_id))
|
|
||||||
# columns from sync_schema3
|
|
||||||
mod_v = datetime.utcfromtimestamp(bmod/1000.0)
|
|
||||||
exp_v = datetime.utcfromtimestamp(exp)
|
|
||||||
|
|
||||||
# add the BSO values.
|
|
||||||
if args.full and col == META_GLOBAL_COLLECTION_NAME:
|
|
||||||
pay = alter_syncids(pay)
|
|
||||||
bso_values.append([
|
|
||||||
collection_id,
|
|
||||||
fxa_uid,
|
|
||||||
fxa_kid,
|
|
||||||
bid,
|
|
||||||
exp_v,
|
|
||||||
mod_v,
|
|
||||||
pay,
|
|
||||||
sid,
|
|
||||||
])
|
|
||||||
|
|
||||||
count += 1
|
|
||||||
if not args.dryrun:
|
|
||||||
logging.debug(
|
|
||||||
"###bso{} {}".format(
|
|
||||||
bso_num,
|
|
||||||
dumper(bso_columns, bso_values)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
for i in range(0, 5):
|
|
||||||
try:
|
|
||||||
transaction.insert(
|
|
||||||
'bsos',
|
|
||||||
columns=bso_columns,
|
|
||||||
values=bso_values
|
|
||||||
)
|
|
||||||
break
|
|
||||||
except grpc._channel_._InactiveRpcError as ex:
|
|
||||||
logging.warn(
|
|
||||||
"Could not write record (attempt {})".format(i),
|
|
||||||
exc_info=ex)
|
|
||||||
time.sleep(.5)
|
|
||||||
else:
|
|
||||||
logging.debug("not writing {} => {}".format(
|
|
||||||
bso_columns, bso_values))
|
|
||||||
return count
|
|
||||||
|
|
||||||
cursor = databases['mysql'].cursor()
|
|
||||||
count = 0
|
|
||||||
try:
|
|
||||||
# Note: cursor() does not support __enter__()
|
|
||||||
logging.info("Processing... {} -> {}:{}".format(
|
|
||||||
uid, fxa_uid, fxa_kid))
|
|
||||||
cursor.execute(sql, (uid,))
|
|
||||||
data = []
|
|
||||||
abort_col = None
|
|
||||||
abort_count = None
|
|
||||||
col_count = 0
|
|
||||||
|
|
||||||
if args.abort:
|
|
||||||
(abort_col, abort_count) = args.abort.split(":")
|
|
||||||
abort_count = int(abort_count)
|
|
||||||
for row in cursor:
|
|
||||||
logging.debug("col: {}".format(row[0]))
|
|
||||||
if abort_col and int(row[1]) == int(abort_col):
|
|
||||||
col_count += 1
|
|
||||||
if col_count > abort_count:
|
|
||||||
logging.debug("Skipping col: {}: {} of {}".format(
|
|
||||||
row[0], col_count, abort_count))
|
|
||||||
continue
|
|
||||||
data.append(row)
|
|
||||||
if args.abort:
|
|
||||||
logging.info("Skipped {} of {} rows for {}".format(
|
|
||||||
abort_count, col_count, abort_col
|
|
||||||
))
|
|
||||||
logging.info(
|
|
||||||
"Moving {} items for user {} => {}:{}".format(
|
|
||||||
len(data), uid, fxa_uid, fxa_kid))
|
|
||||||
|
|
||||||
if args.wipe_user:
|
|
||||||
databases['spanner'].run_in_transaction(
|
|
||||||
spanner_transact_wipe_user,
|
|
||||||
fxa_uid,
|
|
||||||
fxa_kid,
|
|
||||||
args,
|
|
||||||
)
|
|
||||||
|
|
||||||
for bunch in divvy(data, args.chunk or 1000):
|
|
||||||
# Occasionally, there is a batch fail because a
|
|
||||||
# user collection is not found before a bso is written.
|
|
||||||
# to solve that, divide the UC updates from the
|
|
||||||
# BSO updates.
|
|
||||||
# Run through the list of UserCollection updates
|
|
||||||
databases['spanner'].run_in_transaction(
|
|
||||||
spanner_transact_uc,
|
|
||||||
bunch,
|
|
||||||
fxa_uid,
|
|
||||||
fxa_kid,
|
|
||||||
args,
|
|
||||||
)
|
|
||||||
count += databases['spanner'].run_in_transaction(
|
|
||||||
spanner_transact_bso,
|
|
||||||
bunch,
|
|
||||||
fxa_uid,
|
|
||||||
fxa_kid,
|
|
||||||
args,
|
|
||||||
)
|
|
||||||
if args.ms_delay > 0:
|
|
||||||
logging.debug(
|
|
||||||
"Sleeping for {} seconds".format(args.ms_delay * .01))
|
|
||||||
time.sleep(args.ms_delay * .01)
|
|
||||||
|
|
||||||
except AlreadyExists:
|
|
||||||
logging.warn(
|
|
||||||
"User {} already imported fxa_uid:{} / fxa_kid:{}".format(
|
|
||||||
uid, fxa_uid, fxa_kid
|
|
||||||
))
|
|
||||||
report.fail(uid, "exists")
|
|
||||||
return count
|
|
||||||
except InvalidArgument as ex:
|
|
||||||
report.fail(uid, "exists")
|
|
||||||
if "already inserted" in ex.args[0]:
|
|
||||||
logging.warn(
|
|
||||||
"User {} already imported fxa_uid:{} / fxa_kid:{}".format(
|
|
||||||
uid, fxa_uid, fxa_kid
|
|
||||||
))
|
|
||||||
return count
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
except Exception as ex:
|
|
||||||
report.fail(uid, "unexpected batch error")
|
|
||||||
logging.error("Unexpected Batch failure: {}:{}".format(
|
|
||||||
fxa_uid, fxa_kid), exc_info=ex)
|
|
||||||
finally:
|
|
||||||
# cursor may complain about unread data, this should prevent
|
|
||||||
# that warning.
|
|
||||||
for result in cursor:
|
|
||||||
pass
|
|
||||||
cursor.close()
|
|
||||||
report.success(uid)
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
def get_percentage_users(users, user_percent):
|
|
||||||
(block, percentage) = map(
|
|
||||||
int, user_percent.split(':'))
|
|
||||||
total_count = len(users)
|
|
||||||
chunk_size = max(
|
|
||||||
1, math.floor(
|
|
||||||
total_count * (int(percentage) * .01)))
|
|
||||||
chunk_count = math.ceil(total_count / chunk_size)
|
|
||||||
chunk_start = max(block - 1, 0) * chunk_size
|
|
||||||
chunk_end = min(chunk_count, block) * chunk_size
|
|
||||||
if chunk_size * chunk_count > total_count:
|
|
||||||
if block >= chunk_count - 1:
|
|
||||||
chunk_end = total_count
|
|
||||||
users = users[chunk_start:chunk_end]
|
|
||||||
logging.debug(
|
|
||||||
"moving users: {} to {}".format(
|
|
||||||
chunk_start, chunk_end))
|
|
||||||
return users
|
|
||||||
|
|
||||||
|
|
||||||
def get_users(args, databases, fxa, bso_num, report):
|
|
||||||
"""Fetch the user information from the Tokenserver Dump """
|
|
||||||
users = []
|
|
||||||
try:
|
|
||||||
if args.user:
|
|
||||||
for uid in args.user:
|
|
||||||
try:
|
|
||||||
(fxa_kid, fxa_uid) = fxa.get(uid)
|
|
||||||
users.append((uid, fxa_uid, fxa_kid))
|
|
||||||
except TypeError:
|
|
||||||
logging.error(
|
|
||||||
"User {} not found in "
|
|
||||||
"tokenserver data.".format(uid))
|
|
||||||
report.fail(uid, "not found")
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
bso_users_file = args.bso_users_file.replace('#', str(bso_num))
|
|
||||||
with open(bso_users_file) as bso_file:
|
|
||||||
line = 0
|
|
||||||
for row in csv.reader(
|
|
||||||
bso_file, delimiter="\t"
|
|
||||||
):
|
|
||||||
if row[0] == "uid":
|
|
||||||
continue
|
|
||||||
users.append(row)
|
|
||||||
tick(line)
|
|
||||||
line += 1
|
|
||||||
except Exception as ex:
|
|
||||||
logging.critical("Error reading BSO data", exc_info=ex)
|
|
||||||
exit(-1)
|
|
||||||
if args.user_percent:
|
|
||||||
users = get_percentage_users(users, args.user_percent)
|
|
||||||
except Exception as ex:
|
|
||||||
logging.critical("Unexpected Error moving database:", exc_info=ex)
|
|
||||||
exit(-1)
|
|
||||||
return users
|
|
||||||
|
|
||||||
|
|
||||||
def move_database(databases, collections, bso_num, fxa, args, report):
|
|
||||||
"""iterate over provided users and move their data from old to new"""
|
|
||||||
start = time.time()
|
|
||||||
# off chance that someone else might have written
|
|
||||||
# a new collection table since the last time we
|
|
||||||
# fetched.
|
|
||||||
rows = 0
|
|
||||||
users = get_users(args, databases, fxa, bso_num, report)
|
|
||||||
logging.info("Moving {} users".format(len(users)))
|
|
||||||
for user in users:
|
|
||||||
rows += move_user(
|
|
||||||
databases=databases,
|
|
||||||
user_data=user,
|
|
||||||
collections=collections,
|
|
||||||
fxa=fxa,
|
|
||||||
bso_num=bso_num,
|
|
||||||
args=args,
|
|
||||||
report=report)
|
|
||||||
logging.info("Finished BSO #{} ({} rows) in {} seconds".format(
|
|
||||||
bso_num,
|
|
||||||
rows,
|
|
||||||
math.ceil(time.time() - start)
|
|
||||||
))
|
|
||||||
return rows
|
|
||||||
|
|
||||||
def get_args():
|
|
||||||
pid = os.getpid()
|
|
||||||
today = datetime.now().strftime("%Y_%m_%d")
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="move user from sql to spanner")
|
|
||||||
parser.add_argument(
|
|
||||||
'--dsns', default="move_dsns.lst",
|
|
||||||
help="file of new line separated DSNs")
|
|
||||||
parser.add_argument(
|
|
||||||
'--verbose',
|
|
||||||
action="store_true",
|
|
||||||
help="verbose logging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--quiet',
|
|
||||||
action="store_true",
|
|
||||||
help="silence logging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--offset', type=int, default=0,
|
|
||||||
help="UID to start at (default 0)")
|
|
||||||
parser.add_argument(
|
|
||||||
"--full",
|
|
||||||
action="store_true",
|
|
||||||
help="force a full reconcile"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--anon', action='store_true',
|
|
||||||
help="Anonymize the user data"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--start_bso', default=0,
|
|
||||||
type=int,
|
|
||||||
help="start dumping BSO database (default: 0)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--end_bso',
|
|
||||||
type=int, default=19,
|
|
||||||
help="last BSO database to dump (default: 19)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--bso_num',
|
|
||||||
type=int,
|
|
||||||
help="only move this bso (equivalent to start_bso == end_bso)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--write_chunk',
|
|
||||||
dest="chunk",
|
|
||||||
default=1666,
|
|
||||||
help="how many rows per transaction for spanner (default: 1666)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--user',
|
|
||||||
type=str,
|
|
||||||
help="BSO#:userId[,userid,...] to move."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--retry_file',
|
|
||||||
type=str,
|
|
||||||
help="Copy of failure file to read user IDs to retry."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--wipe_user',
|
|
||||||
action="store_true",
|
|
||||||
help="delete any pre-existing --user data on spanner before the migration"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--bso_users_file',
|
|
||||||
default="bso_users_#_{}.lst".format(today),
|
|
||||||
help="name of the generated BSO user file. "
|
|
||||||
"(Will use bso number for `#` if present; "
|
|
||||||
"default: bso_users_#_{}.lst)".format(today),
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--fxa_users_file',
|
|
||||||
default="fxa_users_{}.lst".format(today),
|
|
||||||
help="List of pre-generated FxA users. Only needed if specifying"
|
|
||||||
" the `--user` option; default: fxa_users_{}.lst)".format(today)
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--dryrun',
|
|
||||||
action="store_true",
|
|
||||||
help="Do not write user records to spanner"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--abort',
|
|
||||||
type=str,
|
|
||||||
help="abort data in col after #rows (e.g. history:10)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--user_percent", default="1:100",
|
|
||||||
help=("Offset and percent of users from this BSO"
|
|
||||||
"to move (e.g. 2:50 moves the second 50%%) "
|
|
||||||
"(default 1:100)")
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--ms_delay', type=int, default=0,
|
|
||||||
help="inject a sleep between writes to spanner as a throttle"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--success_file', default="success_{}.log".format(pid),
|
|
||||||
help="File of successfully migrated userids"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--failure_file', default="failure_{}.log".format(pid),
|
|
||||||
help="File of unsuccessfully migrated userids"
|
|
||||||
)
|
|
||||||
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = get_args()
|
|
||||||
log_level = logging.INFO
|
|
||||||
if args.quiet:
|
|
||||||
log_level = logging.ERROR
|
|
||||||
if args.verbose:
|
|
||||||
log_level = logging.DEBUG
|
|
||||||
logging.basicConfig(
|
|
||||||
stream=sys.stdout,
|
|
||||||
level=log_level,
|
|
||||||
)
|
|
||||||
report = Report(args)
|
|
||||||
dsns = open(args.dsns).readlines()
|
|
||||||
databases = {}
|
|
||||||
rows = 0
|
|
||||||
|
|
||||||
if args.user:
|
|
||||||
args.user_percent = "1:100"
|
|
||||||
(bso, userid) = args.user.split(':')
|
|
||||||
args.start_bso = int(bso)
|
|
||||||
args.end_bso = int(bso)
|
|
||||||
user_list = []
|
|
||||||
for id in userid.split(','):
|
|
||||||
user_list.append(int(id))
|
|
||||||
args.user = user_list
|
|
||||||
elif args.wipe_user:
|
|
||||||
raise RuntimeError("--wipe_user requires --user")
|
|
||||||
if args.retry_file:
|
|
||||||
(args.start_bso, args.end_bso, args.user) = report.read_failure(
|
|
||||||
args.retry_file)
|
|
||||||
if args.bso_num is not None:
|
|
||||||
args.start_bso = args.end_bso = args.bso_num
|
|
||||||
for line in dsns:
|
|
||||||
dsn = urlparse(line.strip())
|
|
||||||
scheme = dsn.scheme
|
|
||||||
if 'mysql' in dsn.scheme:
|
|
||||||
scheme = 'mysql'
|
|
||||||
databases[scheme] = conf_db(dsn)
|
|
||||||
if not databases.get('mysql') or not databases.get('spanner'):
|
|
||||||
raise RuntimeError("Both mysql and spanner dsns must be specified")
|
|
||||||
fxa_info = FXA_info(args.fxa_users_file, args, report)
|
|
||||||
collections = Collections(databases)
|
|
||||||
logging.info("Starting:")
|
|
||||||
if args.dryrun:
|
|
||||||
logging.info("=== DRY RUN MODE ===")
|
|
||||||
start = time.time()
|
|
||||||
for bso_num in range(args.start_bso, args.end_bso+1):
|
|
||||||
logging.info("Moving users in bso # {}".format(bso_num))
|
|
||||||
report.bso = bso_num
|
|
||||||
rows += move_database(
|
|
||||||
databases, collections, bso_num, fxa_info, args, report)
|
|
||||||
logging.info(
|
|
||||||
"Moved: {} rows in {} seconds".format(
|
|
||||||
rows or 0, time.time() - start))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
@ -1,111 +0,0 @@
|
|||||||
#! venv/bin/python
|
|
||||||
|
|
||||||
# painfully stupid script to check out dumping a spanner database to avro.
|
|
||||||
# Avro is basically "JSON" for databases. It's not super complicated & it has
|
|
||||||
# issues (one of which is that it requires Python2).
|
|
||||||
# test run Dumped 2770783 rows in 457.566066027 seconds and produced a
|
|
||||||
# roughly 6.5GB file.
|
|
||||||
#
|
|
||||||
# Spanner also has a Deadline issue where it will kill a db connection after
|
|
||||||
# so many minutes (5?). Might be better to just divvy things up into clusters
|
|
||||||
# and have threads handle transporting records over.
|
|
||||||
#
|
|
||||||
|
|
||||||
import avro.schema
|
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
from avro.datafile import DataFileWriter
|
|
||||||
from avro.io import DatumWriter
|
|
||||||
from google.cloud import spanner
|
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
|
||||||
parser = argparse.ArgumentParser(description="dump spanner to arvo files")
|
|
||||||
parser.add_argument(
|
|
||||||
'--instance_id', default="spanner-test",
|
|
||||||
help="Spanner instance name")
|
|
||||||
parser.add_argument(
|
|
||||||
'--database_id', default="sync_schema3",
|
|
||||||
help="Spanner database name")
|
|
||||||
parser.add_argument(
|
|
||||||
'--schema', default="sync.avsc",
|
|
||||||
help="Database schema description")
|
|
||||||
parser.add_argument(
|
|
||||||
'--output', default="output.avso",
|
|
||||||
help="Output file")
|
|
||||||
parser.add_argument(
|
|
||||||
'--limit', type=int, default=1500000,
|
|
||||||
help="Limit to n rows")
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def conf_spanner(args):
|
|
||||||
spanner_client = spanner.Client()
|
|
||||||
instance = spanner_client.instance(args.instance_id)
|
|
||||||
database = instance.database(args.database_id)
|
|
||||||
return database
|
|
||||||
|
|
||||||
|
|
||||||
def dump_rows(offset, db, writer, args):
|
|
||||||
print("Querying.... @{}".format(offset))
|
|
||||||
sql = """
|
|
||||||
SELECT collection_id, fxa_kid, fxa_uid, bso_id,
|
|
||||||
UNIX_MICROS(expiry), UNIX_MICROS(modified), payload,
|
|
||||||
sortindex from bsos LIMIT {} OFFSET {}""".format(args.limit, offset)
|
|
||||||
try:
|
|
||||||
with db.snapshot() as snapshot:
|
|
||||||
result = snapshot.execute_sql(sql)
|
|
||||||
print("Dumping...")
|
|
||||||
for row in result:
|
|
||||||
writer.append({
|
|
||||||
"collection_id": row[0],
|
|
||||||
"fxa_kid": row[1],
|
|
||||||
"fxa_uid": row[2],
|
|
||||||
"bso_id": row[3],
|
|
||||||
"expiry": row[4],
|
|
||||||
"modified": row[5],
|
|
||||||
"payload": row[6],
|
|
||||||
"sortindex": row[7]})
|
|
||||||
offset += 1
|
|
||||||
if offset % 1000 == 0:
|
|
||||||
print("Row: {}".format(offset))
|
|
||||||
return offset
|
|
||||||
except Exception as ex:
|
|
||||||
print("Deadline hit at: {} ({})".format(offset, ex))
|
|
||||||
return offset
|
|
||||||
|
|
||||||
|
|
||||||
def count_rows(db):
|
|
||||||
with db.snapshot() as snapshot:
|
|
||||||
result = snapshot.execute_sql("SELECT Count(*) from bsos")
|
|
||||||
return result.one()[0]
|
|
||||||
|
|
||||||
|
|
||||||
def dump_data(args, schema):
|
|
||||||
offset = 0
|
|
||||||
# things time out around 1_500_000 rows.
|
|
||||||
db = conf_spanner(args)
|
|
||||||
writer = DataFileWriter(
|
|
||||||
open(args.output, "wb"), DatumWriter(), schema)
|
|
||||||
row_count = count_rows(db)
|
|
||||||
print("Dumping {} rows".format(row_count))
|
|
||||||
while offset < row_count:
|
|
||||||
old_offset = offset
|
|
||||||
offset = dump_rows(offset=offset, db=db, writer=writer, args=args)
|
|
||||||
if offset == old_offset:
|
|
||||||
break
|
|
||||||
writer.close()
|
|
||||||
return row_count
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
start = time.time()
|
|
||||||
args = get_args()
|
|
||||||
schema = avro.schema.parse(open(args.schema, "rb").read())
|
|
||||||
rows = dump_data(args, schema)
|
|
||||||
print("Dumped: {} rows in {} seconds".format(rows, time.time() - start))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
@ -1,312 +0,0 @@
#! venv/bin/python

# This file is historical.
# We're using `migrate_node.py`, however this file may be useful in the future
# if we determine there's a problem with directly transcribing the data from
# mysql to spanner.
#
# painfully stupid script to check out dumping mysql databases to avro.
# Avro is basically "JSON" for databases. It's not super complicated & it has
# issues.
#

import avro.schema
import argparse
import binascii
import csv
import base64
import math
import time
import os
import random
import re

from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from mysql import connector
try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


MAX_ROWS = 1500000


class BadDSNException(Exception):
    pass


def get_args():
    parser = argparse.ArgumentParser(description="dump mysql to avro files")
    parser.add_argument(
        '--dsns', default="dsns.lst",
        help="file of new line separated DSNs")
    parser.add_argument(
        '--schema', default="sync.avsc",
        help="Database schema description")
    parser.add_argument(
        '--col_schema', default="user_collection.avsc",
        help="User Collection schema description"
    )
    parser.add_argument(
        '--output', default="output.avso",
        help="Output file")
    parser.add_argument(
        '--limit', type=int, default=1500000,
        help="Limit each read chunk to n rows")
    parser.add_argument(
        '--offset', type=int, default=0,
        help="UID to start at")
    parser.add_argument(
        '--deanon', action='store_false',
        dest='anon',
        help="Do not anonymize the user data"
    )
    parser.add_argument(
        '--start_bso', default=0,
        type=int,
        help="first BSO database to dump"
    )
    parser.add_argument(
        '--end_bso',
        type=int, default=19,
        help="last BSO database to dump"
    )
    parser.add_argument(
        '--token_file',
        default='users.csv',
        help="token user database dump CSV"
    )
    parser.add_argument(
        '--skip_collections', action='store_true',
        help="skip the user_collections table"
    )

    return parser.parse_args()


def conf_db(dsn):
    dsn = urlparse(dsn.strip())
    """
    if dsn.scheme != "mysql":
        raise BadDSNException("Invalid MySQL dsn: {}".format(dsn))
    """
    connection = connector.connect(
        user=dsn.username,
        password=dsn.password,
        host=dsn.hostname,
        port=dsn.port or 3306,
        database=dsn.path[1:]
    )
    return connection


# The following two functions are taken from browserid.utils
def encode_bytes_b64(value):
    return base64.urlsafe_b64encode(value).rstrip(b'=').decode('ascii')


def format_key_id(keys_changed_at, key_hash):
    return "{:013d}-{}".format(
        keys_changed_at,
        encode_bytes_b64(key_hash),
    )


user_ids = {}


def read_in_token_file(filename):
    global user_ids
    # you can generate the token file using
    # `mysql -e "select uid, email, generation, keys_changed_at, \
    #      client_state from users;" > users.csv`
    #
    # future opt: write the transmogrified file to either sqlite3
    # or static files.
    print("Processing token file...")
    with open(filename) as csv_file:
        for (uid, email, generation,
             keys_changed_at, client_state) in csv.reader(
                 csv_file, delimiter="\t"):
            if uid == 'uid':
                # skip the header row.
                continue
            fxa_uid = email.split('@')[0]
            fxa_kid = "{:013d}-{}".format(
                int(keys_changed_at or generation),
                base64.urlsafe_b64encode(
                    binascii.unhexlify(client_state)
                ).rstrip(b'=').decode('ascii'))
            user_ids[uid] = (fxa_kid, fxa_uid)


def get_fxa_id(user_id, anon=True):
    global user_ids
    if user_id in user_ids:
        return user_ids[user_id]
    if not anon:
        raise KeyError(
            "user {} not found in the token file; "
            "--deanon requires a complete users.csv".format(user_id))
    fxa_uid = binascii.hexlify(os.urandom(16)).decode('utf-8')
    fxa_kid = binascii.hexlify(os.urandom(16)).decode('utf-8')
    user_ids[user_id] = (fxa_kid, fxa_uid)
    return (fxa_kid, fxa_uid)


def dump_user_collections(schema, dsn, args):
    # user_collections column mapping:
    #   userid        => fxa_kid, fxa_uid
    #   collection    => collection_id
    #   last_modified => modified
    db = conf_db(dsn)
    cursor = db.cursor()
    out_file = args.output.rsplit('.', 1)
    out_file_name = "{}_user_collections.{}".format(
        out_file[0], out_file[1]
    )
    writer = DataFileWriter(
        open(out_file_name, "wb"), DatumWriter(), schema)
    sql = """
    SELECT userid, collection, last_modified from user_collections
    """
    start = time.time()
    try:
        cursor.execute(sql)
        row = 0
        for (user_id, collection_id, last_modified) in cursor:
            (fxa_kid, fxa_uid) = get_fxa_id(user_id, args.anon)
            try:
                writer.append({
                    "collection_id": collection_id,
                    "fxa_kid": fxa_kid,
                    "fxa_uid": fxa_uid,
                    "modified": last_modified
                })
            except Exception as ex:
                print(ex)
            row += 1
        print(
            "Dumped {} user_collection rows in {} seconds".format(
                row, time.time() - start
            ))
    finally:
        writer.close()
        cursor.close()


def dump_rows(bso_number, chunk_offset, db, writer, args):
    # bso column mapping:
    #   id           => bso_id
    #   collection   => collection_id
    #   sortindex    => sortindex
    #   modified     => modified
    #   payload      => payload
    #   payload_size => NONE
    #   ttl          => expiry

    ivre = re.compile(r'("IV": ?"[^"]+")')
    print("Querying.... bso{} @{}".format(bso_number, chunk_offset))
    sql = """
    SELECT userid, collection, id,
        ttl, modified, payload,
        sortindex from bso{} LIMIT {} OFFSET {}""".format(
        bso_number, args.limit, chunk_offset)
    cursor = db.cursor()
    user = None
    row_count = 0
    try:
        cursor.execute(sql)
        print("Dumping...")
        for (userid, cid, bid, exp, mod, pay, si) in cursor:
            if args.anon:
                # replace the encrypted record's IV so the anonymized payload
                # can no longer be matched against the original ciphertext.
                replacement = encode_bytes_b64(os.urandom(16))
                pay = ivre.sub('"IV":"{}"'.format(replacement), pay)
            if userid != user:
                (fxa_kid, fxa_uid) = get_fxa_id(userid, args.anon)
                user = userid
            writer.append({
                "fxa_uid": fxa_uid,
                "fxa_kid": fxa_kid,
                "collection_id": cid,
                "bso_id": bid,
                "expiry": exp,
                "modified": mod,
                "payload": pay,
                "sortindex": si})
            row_count += 1
            if (chunk_offset + row_count) % 1000 == 0:
                print("BSO:{} Row: {}".format(
                    bso_number, chunk_offset + row_count))
            if row_count >= MAX_ROWS:
                break
    except Exception as e:
        print("Deadline hit at: {} ({})".format(
            chunk_offset + row_count, e))
    finally:
        cursor.close()
    return row_count


def count_rows(db, bso_num=0):
    cursor = db.cursor()
    try:
        cursor.execute("SELECT Count(*) from bso{}".format(bso_num))
        return cursor.fetchone()[0]
    finally:
        cursor.close()


def dump_data(bso_number, schema, dsn, args):
    offset = args.offset or 0
    total_rows = 0
    # things time out around 1_500_000 rows.
    db = conf_db(dsn)
    out_file = args.output.rsplit('.', 1)
    row_count = count_rows(db, bso_number)
    for chunk in range(
            max(1, math.trunc(math.ceil(row_count / MAX_ROWS)))):
        print(
            "Dumping {} rows from bso#{} into chunk {}".format(
                row_count, bso_number, chunk))
        out_file_name = "{}_{}_{}.{}".format(
            out_file[0], bso_number, hex(chunk), out_file[1]
        )
        writer = DataFileWriter(
            open(out_file_name, "wb"), DatumWriter(), schema)
        rows = dump_rows(
            bso_number=bso_number,
            chunk_offset=offset,
            db=db,
            writer=writer,
            args=args)
        writer.close()
        if rows == 0:
            break
        offset = offset + rows
        total_rows += rows
    return total_rows


def main():
    args = get_args()
    rows = 0
    dsns = open(args.dsns).readlines()
    schema = avro.schema.parse(open(args.schema, "rb").read())
    col_schema = avro.schema.parse(open(args.col_schema, "rb").read())
    if args.token_file:
        read_in_token_file(args.token_file)
    start = time.time()
    for dsn in dsns:
        print("Starting: {}".format(dsn))
        try:
            if not args.skip_collections:
                dump_user_collections(col_schema, dsn, args)
            for bso_num in range(args.start_bso, args.end_bso + 1):
                rows += dump_data(bso_num, schema, dsn, args)
        except Exception as ex:
            print("Could not process {}: {}".format(dsn, ex))
    print("Dumped: {} rows in {} seconds".format(rows, time.time() - start))


if __name__ == "__main__":
    main()
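The `fxa_kid` format built by these scripts is easier to see with concrete numbers. A small worked sketch follows; the `keys_changed_at` and `client_state` values are made up for illustration only:

```python
import base64
import binascii

def format_key_id(keys_changed_at, key_hash):
    # Same construction as above: a 13-digit zero-padded timestamp, then the
    # urlsafe-base64 of the raw client_state bytes with '=' padding stripped.
    return "{:013d}-{}".format(
        keys_changed_at,
        base64.urlsafe_b64encode(key_hash).rstrip(b'=').decode('ascii'))

kid = format_key_id(1568322408, binascii.unhexlify(
    "aabbccddeeff00112233445566778899"))
print(kid)  # 0001568322408-qrvM3e7_ABEiM0RVZneImQ
```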
@ -1,516 +0,0 @@
#! venv/bin/python

# This file is historical.
# This file will attempt to copy a user from an existing mysql database
# to a spanner table. It requires access to the tokenserver db, which may
# not be available in production environments.
#

import argparse
import json
import logging
import base64

import sys
import os
import time
from datetime import datetime

from mysql import connector
from mysql.connector.errors import IntegrityError
from google.cloud import spanner
from google.api_core.exceptions import AlreadyExists
try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

SPANNER_NODE_ID = 800
META_GLOBAL_COLLECTION_ID = 6


class BadDSNException(Exception):
    pass


# From server_syncstorage
class MigrationState:
    UNKNOWN = 0
    IN_PROGRESS = 1
    COMPLETE = 2


class Collections:
    """Cache of the spanner collection list.

    The spanner collection list is the (soon to be) single source of
    truth regarding collection ids.

    """
    _by_name = {}
    databases = None

    def __init__(self, databases):
        """Load the cached list of collection ids"""
        sql = """
        SELECT
            name, collection_id
        FROM
            collections
        """
        self.databases = databases
        logging.debug("Fetching collections...")
        with self.databases['spanner'].snapshot() as cursor:
            rows = cursor.execute_sql(sql)
            for row in rows:
                self._by_name[row[0]] = row[1]

    def get_id(self, name, cursor):
        """Get/Init the ID for a given collection"""
        if name in self._by_name:
            return self._by_name.get(name)
        result = cursor.execute_sql("""
            SELECT
                COALESCE(MAX(collection_id), 1)
            FROM
                collections""")
        # preserve the "reserved" / < 100 ids.
        collection_id = max(result.one()[0] + 1, 101)
        cursor.insert(
            table="collections",
            columns=('collection_id', 'name'),
            values=[
                (collection_id, name)
            ]
        )
        self._by_name[name] = collection_id
        return collection_id


def get_args():
    parser = argparse.ArgumentParser(
        description="move user from sql to spanner")
    parser.add_argument(
        '--dsns', default="move_dsns.lst",
        help="file of new line separated DSNs")
    parser.add_argument(
        '--users', default="move_users.lst",
        help="file of new line separated users to move")
    parser.add_argument(
        '--token_dsn',
        help="DSN to the token server database (optional)"
    )
    parser.add_argument(
        '--verbose',
        action="store_true",
        help="verbose logging"
    )
    parser.add_argument(
        '--quiet',
        action="store_true",
        help="silence logging"
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="force a full reconcile"
    )
    return parser.parse_args()


def conf_mysql(dsn):
    """create a connection to the original storage system """
    logging.debug("Configuring MYSQL: {}".format(dsn))
    connection = connector.connect(
        user=dsn.username,
        password=dsn.password,
        host=dsn.hostname,
        port=dsn.port or 3306,
        database=dsn.path[1:]
    )
    return connection


def conf_spanner(dsn):
    """create a connection to the new Spanner system"""
    logging.debug("Configuring SPANNER: {}".format(dsn))
    path = dsn.path.split("/")
    instance_id = path[-3]
    database_id = path[-1]
    client = spanner.Client()
    instance = client.instance(instance_id)
    database = instance.database(database_id)
    return database


def conf_db(dsn):
    """read the list of storage definitions from the file and create
    a set of connections.

    """
    if dsn.scheme == "mysql":
        return conf_mysql(dsn)
    if dsn.scheme == "spanner":
        return conf_spanner(dsn)
    raise RuntimeError("Unknown DSN type: {}".format(dsn.scheme))


def update_token(databases, user):
    """optionally update the TokenServer storage indicating the user
    is now on Spanner

    """
    if 'token' not in databases:
        logging.warn(
            "Skipping token update for user {}...".format(user))
        return
    logging.info("Updating token server for user: {}".format(user))
    try:
        cursor = databases['token'].cursor()
        cursor.execute(
            """
            UPDATE
                users
            SET
                replaced_at = {timestamp},
                nodeid = {nodeid}
            WHERE
                uid = {uid}
            """.format(
                timestamp=int(time.time() * 100),
                nodeid=SPANNER_NODE_ID,
                uid=user)
        )
        databases['token'].commit()
    finally:
        cursor.close()


# The following two functions are taken from browserid.utils
def encode_bytes_b64(value):
    return base64.urlsafe_b64encode(value).rstrip(b'=').decode('ascii')


def format_key_id(keys_changed_at, key_hash):
    return "{:013d}-{}".format(
        keys_changed_at,
        encode_bytes_b64(key_hash),
    )


def get_fxa_id(databases, user):
    """generate the spanner user key values from the original storage
    data.

    """
    sql = """
    SELECT
        email, generation, keys_changed_at, client_state, node
    FROM users
    WHERE uid = {uid}
    """.format(uid=user)
    try:
        cursor = databases.get('token', databases['mysql']).cursor()
        cursor.execute(sql)
        (email, generation, keys_changed_at,
         client_state, node) = next(cursor)
        fxa_uid = email.split('@')[0]
        fxa_kid = format_key_id(
            keys_changed_at or generation,
            bytes.fromhex(client_state),
        )
    finally:
        cursor.close()
    return (fxa_kid, fxa_uid, node)


def create_migration_table(database):
    """create the syncstorage migration table

    This table tells the syncstorage server to return a 5xx for a
    given user. It's important that syncstorage NEVER returns a
    2xx result for any user that's in migration, or only does
    so after deleting the meta/global BSO record so that a full
    reconcile happens. (Depends on
    https://github.com/mozilla-services/server-syncstorage/pull/136)
    """
    try:
        cursor = database.cursor()
        cursor.execute(
            """CREATE TABLE IF NOT EXISTS
                migration (
                    fxa_uid VARCHAR(255) NOT NULL PRIMARY KEY,
                    started_at BIGINT NOT NULL,
                    state SMALLINT
                )
            """)
        database.commit()
    finally:
        cursor.close()


def dumper(columns, values):
    """verbose column and data dumper. """
    result = ""
    for row in values:
        for i in range(0, len(columns)):
            result += " {} => {}\n".format(columns[i], row[i])
    return result


def mark_user(databases, user, state=MigrationState.IN_PROGRESS):
    """mark a user as in migration"""
    try:
        mysql = databases['mysql'].cursor()
        if state == MigrationState.IN_PROGRESS:
            try:
                logging.info("Marking {} as migrating...".format(user))
                mysql.execute(
                    "INSERT INTO migration "
                    "(fxa_uid, started_at, state) VALUES (%s, %s, %s)",
                    (user, int(time.time()), state)
                )
                databases['mysql'].commit()
            except IntegrityError:
                return False
        if state == MigrationState.COMPLETE:
            logging.info("Marking {} as migrated...".format(user))
            mysql.execute(
                "UPDATE migration SET state = %s WHERE fxa_uid = %s",
                (state, user)
            )
            databases['mysql'].commit()
    finally:
        mysql.close()
    return True


def finish_user(databases, user):
    """mark a user migration complete"""
    # This is not wrapped into `mark_user` so that I can reduce
    # the number of db IO, since an upsert would just work instead
    # of fail out with a dupe.
    mysql = databases['mysql'].cursor()
    try:
        logging.info("Marking {} as migrated...".format(user))
        mysql.execute(
            """
            UPDATE
                migration
            SET
                state = %s
            WHERE
                fxa_uid = %s
            """,
            (MigrationState.COMPLETE, user)
        )
        databases['mysql'].commit()
    except IntegrityError:
        return False
    finally:
        mysql.close()
    return True


def newSyncID():
    return base64.urlsafe_b64encode(os.urandom(9)).decode('ascii')


def alter_syncids(pay):
    """Alter the syncIDs for the meta/global record, which will cause a sync
    when the client reconnects

    """
    payload = json.loads(pay)
    payload['syncID'] = newSyncID()
    for item in payload['engines']:
        payload['engines'][item]['syncID'] = newSyncID()
    return json.dumps(payload)


def move_user(databases, user, args):
    """copy user info from original storage to new storage."""
    # bso column mapping:
    #   id           => bso_id
    #   collection   => collection_id
    #   sortindex    => sortindex
    #   modified     => modified
    #   payload      => payload
    #   payload_size => NONE
    #   ttl          => expiry

    # user collections require a unique key.
    unique_key_filter = set()

    # off chance that someone else might have written
    # a new collection table since the last time we
    # fetched.
    collections = Collections(databases)

    uc_columns = (
        'fxa_kid',
        'fxa_uid',
        'collection_id',
        'modified',
    )
    bso_columns = (
        'collection_id',
        'fxa_kid',
        'fxa_uid',
        'bso_id',
        'expiry',
        'modified',
        'payload',
        'sortindex',
    )

    # Generate the Spanner Keys we'll need.
    (fxa_kid, fxa_uid, original_node) = get_fxa_id(databases, user)
    if not mark_user(databases, fxa_uid):
        logging.error("User {} already being migrated?".format(fxa_uid))
        return

    # Fetch the BSO data from the original storage.
    sql = """
    SELECT
        collections.name, bso.collection,
        bso.id, bso.ttl, bso.modified, bso.payload, bso.sortindex
    FROM
        collections, bso
    WHERE
        bso.userid = %s and collections.collectionid = bso.collection
    ORDER BY
        modified DESC"""

    count = 0

    def spanner_transact(transaction):
        collection_id = collections.get_id(col, transaction)
        if collection_id != cid:
            logging.warn(
                "Remapping collection '{}' from {} to {}".format(
                    col, cid, collection_id))
        # columns from sync_schema3
        mod_v = datetime.utcfromtimestamp(mod / 1000.0)
        exp_v = datetime.utcfromtimestamp(exp)
        # User_Collection can only have unique values. Filter
        # non-unique keys and take the most recent modified
        # time. The join could be anything.
        uc_key = "{}_{}_{}".format(fxa_uid, fxa_kid, col)
        if uc_key not in unique_key_filter:
            unique_key_filter.add(uc_key)
            uc_values = [(
                fxa_kid,
                fxa_uid,
                collection_id,
                mod_v,
            )]
            logging.debug(
                "### uc: {}".format(dumper(uc_columns, uc_values)))
            transaction.insert(
                'user_collections',
                columns=uc_columns,
                values=uc_values
            )
        # add the BSO values.
        pay_out = pay
        if args.full and collection_id == META_GLOBAL_COLLECTION_ID:
            pay_out = alter_syncids(pay)
        bso_values = [[
            collection_id,
            fxa_kid,
            fxa_uid,
            bid,
            exp_v,
            mod_v,
            pay_out,
            sid,
        ]]

        logging.debug(
            "###bso: {}".format(dumper(bso_columns, bso_values)))
        transaction.insert(
            'bsos',
            columns=bso_columns,
            values=bso_values
        )

    mysql = databases['mysql'].cursor()
    try:
        # Note: cursor() does not support __enter__()
        mysql.execute(sql, (user,))
        logging.info("Processing... {} -> {}:{}".format(
            user, fxa_uid, fxa_kid))
        for (col, cid, bid, exp, mod, pay, sid) in mysql:
            databases['spanner'].run_in_transaction(spanner_transact)
            count += 1
        update_token(databases, user)
        (ck_kid, ck_uid, ck_node) = get_fxa_id(databases, user)
        if ck_node != original_node:
            logging.error(
                ("User's Node Changed! Aborting! "
                 "user: {}, fxa_uid: {}, fxa_kid: {}, node: {} => {}")
                .format(user, fxa_uid, fxa_kid,
                        original_node, ck_node)
            )
            return
        finish_user(databases, fxa_uid)
    except AlreadyExists:
        logging.warn(
            "User already imported fxa_uid:{} / fxa_kid:{}".format(
                fxa_uid, fxa_kid
            ))
    except Exception as e:
        logging.error("### batch failure: %s", e)
    finally:
        # cursor may complain about unread data, this should prevent
        # that warning.
        for result in mysql:
            pass
        logging.debug("Closing...")
        mysql.close()
    return count


def move_data(databases, users, args):
    """iterate over provided users and move their data from old to new"""
    rows = 0
    for user in users:
        rows += move_user(databases, user.strip(), args) or 0
    return rows


def main():
    start = time.time()
    args = get_args()
    log_level = logging.INFO
    if args.quiet:
        log_level = logging.ERROR
    if args.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(
        stream=sys.stdout,
        level=log_level,
    )
    dsns = open(args.dsns).readlines()
    users = open(args.users).readlines()
    databases = {}
    for line in dsns:
        dsn = urlparse(line.strip())
        databases[dsn.scheme] = conf_db(dsn)
    if args.token_dsn:
        dsn = urlparse(args.token_dsn)
        databases['token'] = conf_db(dsn)
    if not databases.get('mysql') or not databases.get('spanner'):
        raise RuntimeError("Both mysql and spanner dsns must be specified")

    # create the migration table if it's not already present.
    # This table is used by the sync storage server to force a 500 return
    # for a user in migration.
    create_migration_table(databases['mysql'])

    logging.info("Starting:")
    rows = move_data(databases, users, args)
    logging.info(
        "Moved: {} rows in {} seconds".format(
            rows or 0, time.time() - start))


if __name__ == "__main__":
    main()
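The `migration` table created by this script is what lets a storage node refuse requests while a user is mid-migration. A minimal, self-contained sketch of how a consumer could consult it; the connection parameters and uid are illustrative placeholders, not values from this repository:

```python
from mysql import connector

IN_PROGRESS = 1  # matches MigrationState.IN_PROGRESS above


def is_migrating(db, fxa_uid):
    # Return True if the user has an unfinished row in the migration table.
    cursor = db.cursor()
    try:
        cursor.execute(
            "SELECT state FROM migration WHERE fxa_uid = %s", (fxa_uid,))
        row = cursor.fetchone()
        return row is not None and row[0] == IN_PROGRESS
    finally:
        cursor.close()


db = connector.connect(user="test", password="test",
                       host="localhost", database="syncstorage")
if is_migrating(db, "some_fxa_uid"):
    print("503: user is being migrated")
```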
@ -1,4 +0,0 @@
wheel
avro-python3
google-cloud-spanner
mysql-connector
@ -1,13 +0,0 @@
{"namespace": "bso.avro",
 "type": "record",
 "name": "bso",
 "fields": [
     {"name": "fxa_uid", "type": ["null", "string"]},
     {"name": "fxa_kid", "type": ["null", "string"]},
     {"name": "collection_id", "type": ["null", "long"]},
     {"name": "bso_id", "type": "string"},
     {"name": "expiry", "type": "long"},
     {"name": "modified", "type": "long"},
     {"name": "payload", "type": "string"},
     {"name": "sortindex", "type": ["null", "long"]}
 ]}
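As a rough illustration, a record conforming to this schema can be written with the same avro-python3 calls the dump scripts use. The file name assumes this schema is the `sync.avsc` the scripts default to, and every field value below is a placeholder:

```python
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

# Mirrors the calls used by the dump scripts above.
schema = avro.schema.parse(open("sync.avsc", "rb").read())
writer = DataFileWriter(open("example.avso", "wb"), DatumWriter(), schema)
writer.append({
    "fxa_uid": "0123456789abcdef0123456789abcdef",
    "fxa_kid": "0001568322408-qrvM3e7_ABEiM0RVZneImQ",
    "collection_id": 6,
    "bso_id": "global",
    "expiry": 2100000000,
    "modified": 1568322408000,
    "payload": "{}",
    "sortindex": None,
})
writer.close()
```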
@ -1,3 +0,0 @@
wheel
google-cloud-spanner
mysql-connector