refactor: remove user migration utils (#1710)

This commit is contained in:
Taddes 2025-06-23 16:05:57 -04:00 committed by GitHub
parent 1dc421474d
commit f01c21fef4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 0 additions and 2396 deletions

View File

@ -4,8 +4,6 @@ See each directory for details:
* [hawk](hawk) - a tool for generating test HAWK authorization headers
* [spanner](spanner) - Google Cloud Platform Spanner tools for maintenance and testing
* [user_migration](user_migration) - scripts for dumping and moving user data from SQL to Spanner
## Installation
These tools are mostly written in python. It is recommended that you create a commonly shared virtual environment using something like:

View File

@ -1,100 +0,0 @@
# User Migration Script
This is a workspace for testing user migration from the old databases
to the new durable one.
There are several candidate scripts that you can use.
These build on each other, with earlier runs providing cached results for later ones.
There are a few base files you'll want to declare:
* *dsns* - a file containing the mysql and spanner DSNs for the user databases.
Each DSN should be on its own line. Currently only one DSN of a
given type is permitted.
(e.g.)
```text
mysql://test:test@localhost/syncstorage
spanner://projects/sync-spanner-dev-225401/instances/spanner-test/databases/sync_schema3
```
* *users.csv* - a mysql dump of the token database. This file is only needed if the `--deanon` de-anonymization flag is set. By default, data is anonymized to prevent accidental movement.
You can produce this file from the following:
```bash
mysql -e "select uid, email, generation, keys_changed_at, \
client_state from users;" > users.csv`
```
The script will automatically skip the header row and presumes that fields are tab-separated.
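For reference, here is a minimal sketch (not part of the original tooling) of how that dump is consumed, assuming it was saved as `users.csv`: fields are tab-delimited, the header row is skipped, and `fxa_uid` is derived from the email column, just as the scripts themselves do.
```python
import csv

# Read the tab-separated Tokenserver dump produced above, skipping the
# header row, the same way the migration scripts do.
with open("users.csv") as csv_file:
    for (uid, email, generation, keys_changed_at,
         client_state) in csv.reader(csv_file, delimiter="\t"):
        if uid == "uid":
            continue  # header row
        fxa_uid = email.split("@")[0]  # the scripts derive fxa_uid this way
        print(uid, fxa_uid)
```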
With those files you can now run:
```bash
gen_fxa_users.py
```
which will take the `users.csv` raw data and generate an
`fxa_users_{date}.lst` file.
```bash
gen_bso_users.py --bso_num #
```
which will automatically read in the `fxa_users_{date}.lst` file,
connect to the mysql database, and generate a list of sorted users
taken from the `bso#` table. This will create the
`bso_users_{bso_num}_{date}.lst` file.
and finally:
```bash
GOOGLE_APPLICATION_CREDENTIALS=credentials.json migrate_node.py \
[--start_bso=0] \
[--end_bso=19] \
[--user_percent 1:100]
```
which will read the `bso_users_#_{date}.lst` files and move the users
based on `--user_percent`.
More importantly, `--help` is your friend; feel free to use it liberally.
## installation
```bash
virtualenv venv && venv/bin/pip install -r requirements.txt
```
## running
Since you will be connecting to the GCP Spanner API, you will need to set the `GOOGLE_APPLICATION_CREDENTIALS` env var before running these scripts. This environment variable should point to the exported Google credentials file acquired from the GCP console.
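For example, a typical invocation might look like the following (using the `credentials.json` file mentioned above; `--dryrun` is optional and simply avoids writing to Spanner):
```bash
# Point at the service-account JSON exported from the GCP console.
export GOOGLE_APPLICATION_CREDENTIALS=credentials.json
migrate_node.py --dsns move_dsns.lst --dryrun
```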
The scripts will take the following actions:
1. fetch all users from a given node.
1. compare and port all user_collections over (NOTE: this may involve remapping collectionid values.)
1. begin copying over user information from mysql to spanner.
Overall performance may be improved by "batching" BSOs to different
processes using:
* `--start_bso` - the first BSO database to copy from (default: 0, inclusive)
* `--end_bso` - the last BSO database to copy from (default: 19, inclusive)
Note that both values are inclusive, so to split the work between two
processes you would use:
```bash
migrate_node.py --start_bso=0 --end_bso=9 &
migrate_node.py --start_bso=10 --end_bso=19 &
```
(As shorthand for this case, you could also do:
```bash
migrate_node.py --end_bso=9 &
migrate_node.py --start_bso=10 &
```
and let the defaults handle the rest.)
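A single BSO's users can also be split across processes with `--user_percent` (a `block:percent` pair, as described in `--help`). For example, a sketch:
```bash
# Move the first and second halves of each BSO's users in parallel.
migrate_node.py --user_percent 1:50 &
migrate_node.py --user_percent 2:50 &
```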

View File

@ -1,15 +0,0 @@
INSERT IGNORE INTO weave0.collections (name, collectionid) VALUES
("clients", 1),
("crypto", 2),
("forms", 3),
("history", 4),
("keys", 5),
("meta", 6),
("bookmarks", 7),
("prefs", 8),
("tabs", 9),
("passwords", 10),
("addons", 11),
("addresses", 12),
("creditcards", 13),
("reserved", 99);

View File

@ -1,290 +0,0 @@
#! venv/bin/python
#
import argparse
import logging
import threading
import csv
import sys
import os
from datetime import datetime
from mysql import connector
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
def tick(count):
mark = None
if count % 1000 == 0:
mark = "|"
elif count % 100 == 0:
mark = "."
level = logging.getLogger().getEffectiveLevel()
if mark and level > logging.DEBUG:
print(mark, end='', flush=True)
class Report:
bso = "init"
_failure = None
_success = None
def __init__(self, args, lock=None):
self._success_file = args.success_file
self._failure_file = args.failure_file
self._lock = lock
def success(self, uid):
if self._lock:
self._lock.acquire()
if not self._success:
self._success = open(self._success_file, "w")
self._success.write("{}\t{}\n".format(self.bso, uid))
if self._lock:
# release so other BSO threads are not blocked forever
self._lock.release()
def fail(self, uid, reason=None):
if self._lock:
self._lock.acquire()
if not self._failure:
self._failure = open(self._failure_file, "w")
logging.debug("Skipping user {}".format(uid))
self._failure.write("{}\t{}\t{}\n".format(self.bso, uid, reason or ""))
if self._lock:
self._lock.release()
def close(self):
self._success.close()
self._failure.close()
class BSO_Users:
"""User information from Tokenserver database.
Can be constructed from
``mysql -e "select uid, email, generation, keys_changed_at, \
client_state from users;" > users.csv`
"""
users = {}
anon = False
def __init__(self, args, report, dsn):
self.args = args
self.dsn = dsn
self.report = report
self.get_users(args)
def get_users(self, args):
try:
logging.info("Reading fxa_user data.")
with open(args.fxa_users_file) as csv_file:
line = 0
for (uid, fxa_uid, fxa_kid) in csv.reader(
csv_file, delimiter="\t"
):
if uid == "uid":
continue
tick(line)
logging.debug("Read: {} {}:{}".format(
uid, fxa_uid, fxa_kid))
self.users[int(uid)] = (fxa_uid, fxa_kid)
line += 1
print("")
except Exception as ex:
logging.error(
"Unexpected error",
exc_info=ex
)
self.report.fail(uid, "Unexpected error {}".format(ex))
def run(self, bso_num):
connection = self.conf_mysql(self.dsn)
out_users = []
bso_file = self.args.output_file
bso_file = bso_file.replace("#", str(bso_num))
logging.info("Fetching users from BSO db into {}".format(
bso_file,
))
output_file = open(bso_file, "w")
try:
cursor = connection.cursor()
sql = ("""select userid, count(*) as count from bso{}"""
""" group by userid order by userid""".format(
bso_num))
if self.args.user_range:
(offset, limit) = self.args.user_range.split(':')
sql = "{} limit {} offset {}".format(
sql, limit, offset)
cursor.execute(sql)
for (uid, count) in cursor:
try:
(fxa_uid, fxa_kid) = self.users.get(uid)
if self.args.hoard_limit and count > self.args.hoard_limit:
logging.warn(
"User {} => {}:{} has too "
"many items: {} ".format(
uid, fxa_uid, fxa_kid, count
)
)
self.report.fail(uid, "hoarder {}".format(count))
continue
out_users.append((uid, fxa_uid, fxa_kid))
except TypeError:
self.report.fail(uid, "not found")
logging.error(
("User {} not found in "
"tokenserver data".format(uid)))
if self.args.sort_users:
logging.info("Sorting users...")
out_users.sort(key=lambda tup: tup[1])
# Take a block of percentage of the users.
logging.info("Writing out {} users".format(len(out_users)))
line = 0
output_file.write("uid\tfxa_uid\tfxa_kid\n")
for (uid, fxa_uid, fxa_kid) in out_users:
output_file.write("{}\t{}\t{}\n".format(
uid, fxa_uid, fxa_kid
))
tick(line)
line += 1
output_file.flush()
print("")
except connector.errors.ProgrammingError as ex:
logging.error(ex)
output_file.close()
os.unlink(bso_file)
except Exception as e:
logging.error("### Exception {}:{}", exc_info=e)
output_file.close()
os.unlink(bso_file)
finally:
cursor.close()
def conf_mysql(self, dsn):
"""create a connection to the original storage system """
logging.debug("Configuring MYSQL: {}".format(dsn))
return connector.connect(
user=dsn.username,
password=dsn.password,
host=dsn.hostname,
port=dsn.port or 3306,
database=dsn.path[1:]
)
def get_args():
pid = os.getpid()
parser = argparse.ArgumentParser(
description="Generate BSO user list")
parser.add_argument(
'--dsns', default="move_dsns.lst",
help="file of new line separated DSNs")
parser.add_argument(
'--start_bso',
default=0,
help="Start of BSO range (default 0)"
)
parser.add_argument(
'--end_bso',
default=19,
help="End of BSO range inclusive (default 19)"
)
parser.add_argument(
'--bso_num',
type=int,
default=0,
help="Only read from this bso (default num)"
)
parser.add_argument(
'--output_file',
default="bso_users_#_{}.lst".format(
datetime.now().strftime("%Y_%m_%d")),
help="List of BSO users."
)
parser.add_argument(
'--verbose',
action="store_true",
help="verbose logging"
)
parser.add_argument(
'--quiet',
action="store_true",
help="silence logging"
)
parser.add_argument(
'--user_range',
help="Range of users to extract (offset:limit)"
)
parser.add_argument(
'--hoard_limit', type=int, default=0,
help="reject any user with more than this count of records"
)
parser.add_argument(
'--sort_users', action="store_true",
help="Sort the user"
)
parser.add_argument(
'--success_file', default="success_bso_user.log".format(pid),
help="File of successfully migrated userids"
)
parser.add_argument(
'--failure_file', default="failure_bso_user.log".format(pid),
help="File of unsuccessfully migrated userids"
)
parser.add_argument(
'--fxa_users_file',
default="fxa_users_{}.lst".format(datetime.now().strftime("%Y_%m_%d")),
help="List of pre-generated FxA users."
)
parser.add_argument(
'--threading',
action="store_true",
help="use threading"
)
return parser.parse_args()
def main():
threads = []
args = get_args()
log_level = logging.INFO
if args.quiet:
log_level = logging.ERROR
if args.verbose:
log_level = logging.DEBUG
logging.basicConfig(
stream=sys.stdout,
level=log_level,
)
if args.bso_num is not None:
args.start_bso = args.end_bso = args.bso_num
locker = None
if args.threading:
locker = threading.Lock()
report = Report(args, locker)
dsns = open(args.dsns).readlines()
db_dsn = None
for line in dsns:
dsn = urlparse(line.strip())
if 'mysql' in dsn.scheme:
db_dsn = dsn
if not db_dsn:
RuntimeError("mysql dsn must be specified")
bso = BSO_Users(args, report, db_dsn)
# threading is currently in process.
if args.threading:
for bso_num in range(int(args.start_bso), int(args.end_bso) + 1):
t = threading.Thread(target=bso.run, args=(bso_num,))
threads.append(t)
t.start()
else:
bso.run(args.bso_num)
for thread in threads:
thread.join()
if __name__ == "__main__":
main()

View File

@ -1,203 +0,0 @@
#! venv/bin/python
#
import argparse
import logging
import base64
import binascii
import csv
import sys
import os
from datetime import datetime
def tick(count):
mark = None
if count % 1000 == 0:
mark = "|"
elif count % 100 == 0:
mark = "."
level = logging.getLogger().getEffectiveLevel()
if mark and level > logging.DEBUG:
print(mark, end='', flush=True)
class Report:
bso = "init"
_success = None
_failure = None
def __init__(self, args):
self._success_file = args.success_file
self._failure_file = args.failure_file
def success(self, uid):
if not self._success:
self._success = open(self._success_file, "w")
self._success.write("{}\t{}\n".format(self.bso, uid))
def fail(self, uid, reason=None):
if not self._failure:
self._failure = open(self._failure_file, "w")
logging.debug("Skipping user {}".format(uid))
self._failure.write("{}\t{}\t{}\n".format(self.bso, uid, reason or ""))
def close(self):
self._success.close()
self._failure.close()
class FxA_Generate:
"""User information from Tokenserver database.
Can be constructed from
``mysql -e "select uid, email, generation, keys_changed_at, \
client_state from users;" > users.csv`
"""
users = []
anon = False
def __init__(self, args, report):
logging.info("Processing token file: {} into {}".format(
args.users_file,
args.output_file,
))
output_file = open(args.output_file, "w")
output_file.write("uid\tfxa_uid\tfxa_kid\n")
if not os.path.isfile(args.users_file):
raise IOError("{} not found".format(args.users_file))
with open(args.users_file) as csv_file:
try:
line = 0
success = 0
for (uid, email, generation,
keys_changed_at, client_state) in csv.reader(
csv_file, delimiter="\t"):
line += 1
if uid == 'uid':
# skip the header row.
continue
tick(line)
try:
fxa_uid = email.split('@')[0]
try:
keys_changed_at = int(keys_changed_at)
except ValueError:
keys_changed_at = 0
try:
generation = int(generation)
except ValueError:
generation = 0
if (keys_changed_at or generation) == 0:
logging.warn(
"user {} has no k_c_a or "
"generation value".format(
uid))
# trap for actually blank values
if client_state is None or client_state == '':
logging.error(
"User {} "
"has an invalid, empty client state".format(
uid
)
)
report.fail(uid, "invalid client state")
continue
try:
client_state = binascii.unhexlify(client_state)
except binascii.Error:
logging.error(
"User {} has "
"invalid client state: {}".format(
uid, client_state
))
report.fail(uid, "bad client state")
continue
fxa_kid = self.format_key_id(
int(keys_changed_at or generation),
client_state
)
logging.debug("Adding user {} => {} , {}".format(
uid, fxa_uid, fxa_kid
))
output_file.write(
"{}\t{}\t{}\n".format(
uid, fxa_uid, fxa_kid))
success += 1
except Exception as ex:
logging.error(
"User {} Unexpected error".format(uid),
exc_info=ex)
report.fail(uid, "unexpected error")
except Exception as ex:
logging.critical("Error in fxa file around line {}".format(
line), exc_info=ex)
print("")
logging.info("Processed {} users, {} successful".format(line, success))
# The following two functions are taken from browserid.utils
def encode_bytes_b64(self, value):
return base64.urlsafe_b64encode(value).rstrip(b'=').decode('ascii')
def format_key_id(self, keys_changed_at, key_hash):
return "{:013d}-{}".format(
keys_changed_at,
self.encode_bytes_b64(key_hash),
)
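# Illustrative example (added for clarity): with keys_changed_at=1 and a
# client_state of hex "aabbccdd", format_key_id returns
# "0000000000001-qrvM3Q" -- the zero-padded timestamp joined to the
# urlsafe-base64 (padding stripped) encoding of the client state bytes.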
def get_args():
pid = os.getpid()
parser = argparse.ArgumentParser(
description="Generate FxA user id info")
parser.add_argument(
'--users_file',
default="users.csv",
help="FXA User info in CSV format (default users.csv)"
)
parser.add_argument(
'--output_file',
default="fxa_users_{}.lst".format(datetime.now().strftime("%Y_%m_%d")),
help="List of FxA users."
)
parser.add_argument(
'--verbose',
action="store_true",
help="verbose logging"
)
parser.add_argument(
'--quiet',
action="store_true",
help="silence logging"
)
parser.add_argument(
'--success_file', default="success_fxa_user.log".format(pid),
help="File of successfully migrated userids"
)
parser.add_argument(
'--failure_file', default="failure_fxa_user.log".format(pid),
help="File of unsuccessfully migrated userids"
)
return parser.parse_args()
def main():
args = get_args()
log_level = logging.INFO
if args.quiet:
log_level = logging.ERROR
if args.verbose:
log_level = logging.DEBUG
logging.basicConfig(
stream=sys.stdout,
level=log_level,
)
report = Report(args)
FxA_Generate(args, report)
if __name__ == "__main__":
main()

View File

@ -1,827 +0,0 @@
#! venv/bin/python
# Move user data for a storage node from MySQL to Spanner.
# See the README in this directory for the overall migration workflow.
#
#
import argparse
import logging
import base64
import binascii
import csv
import sys
import math
import json
import os
import time
from datetime import datetime
import grpc
from mysql import connector
from google.cloud import spanner
from google.cloud.spanner_v1 import param_types
from google.api_core.exceptions import AlreadyExists, InvalidArgument
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
META_GLOBAL_COLLECTION_NAME = "meta"
MAX_ROWS = 1500000
class BadDSNException(Exception):
pass
def tick(count):
mark = None
if not count % 100:
mark = "."
if not count % 1000:
mark = "|"
level = logging.getLogger().getEffectiveLevel()
if mark and level > logging.DEBUG:
print(mark, end='', flush=True)
class Report:
bso = "init"
_success = None
_failure = None
def __init__(self, args):
self._success_file = args.success_file
self._failure_file = args.failure_file
def success(self, uid):
if not self._success:
self._success = open(self._success_file, "w")
self._success.write("{}\t{}\n".format(self.bso, uid))
def fail(self, uid, reason=None):
if not self._failure:
self._failure = open(self._failure_file, "w")
logging.debug("Skipping user {}".format(uid))
self._failure.write("{}\t{}\t{}\n".format(self.bso, uid, reason or ""))
def close(self):
self._success.close()
self._failure.close()
def read_failure(self, input):
start = 19
end = 0
users = []
for line in open(input).readlines():
line = line.strip()
if line[0] in "#;/":
continue
(bso, user, reason) = line.split("\t")
start = min(start, int(bso))
end = max(end, int(bso))
users.append(user)
return (int(start), int(end), users)
class FXA_info:
"""User information from Tokenserver database.
Can be constructed from
``mysql -e "select uid, email, generation, keys_changed_at, \
client_state from users;" > users.csv`
"""
users = {}
anon = False
def __init__(self, users_file, args, report):
if args.anon:
self.anon = True
return
logging.info("Reading users file: {}".format(users_file))
if not os.path.isfile(users_file):
raise IOError("{} not found".format(users_file))
with open(users_file) as csv_file:
try:
line = 0
for (uid, fxa_uid, fxa_kid) in csv.reader(
csv_file, delimiter="\t"):
line += 1
tick(line)
if uid == 'uid':
# skip the header row.
continue
if args.user:
if int(uid) not in args.user:
continue
try:
self.users[int(uid)] = (fxa_kid, fxa_uid)
except Exception as ex:
logging.error(
"User {} Unexpected error".format(uid),
exc_info=ex)
report.fail(uid, "unexpected error")
except Exception as ex:
logging.critical("Error in fxa file around line {}".format(
line), exc_info=ex)
def get(self, userid):
if userid in self.users:
return self.users[userid]
if self.anon:
fxa_uid = "fake_" + binascii.hexlify(
os.urandom(11)).decode('utf-8')
fxa_kid = "fake_" + binascii.hexlify(
os.urandom(11)).decode('utf-8')
self.users[userid] = (fxa_kid, fxa_uid)
return (fxa_kid, fxa_uid)
class Collections:
"""Cache spanner collection list.
The spanner collection list is the (soon to be) single source of
truth regarding collection ids.
"""
_by_name = {
"clients": 1,
"crypto": 2,
"forms": 3,
"history": 4,
"keys": 5,
"meta": 6,
"bookmarks": 7,
"prefs": 8,
"tabs": 9,
"passwords": 10,
"addons": 11,
"addresses": 12,
"creditcards": 13,
"reserved": 100,
}
spanner = None
def __init__(self, databases):
"""merge the mysql user_collections into spanner"""
sql = """
SELECT
DISTINCT uc.collection, cc.name
FROM
user_collections as uc,
collections as cc
WHERE
uc.collection = cc.collectionid
ORDER BY
uc.collection
"""
cursor = databases['mysql'].cursor()
def transact(transaction, values):
transaction.insert(
'collections',
columns=('collection_id', 'name'),
values=values)
self.spanner = databases['spanner']
try:
# fetch existing:
with self.spanner.snapshot() as scursor:
rows = scursor.execute_sql(
"select collection_id, name from collections")
for (collection_id, name) in rows:
logging.debug("Loading collection: {} => {}".format(
name, collection_id
))
self._by_name[name] = collection_id
cursor.execute(sql)
for (collection_id, name) in cursor:
if name not in self._by_name:
logging.debug("Adding collection: {} => {}".format(
name, collection_id
))
values = [(collection_id, name)]
self._by_name[name] = collection_id
# Since a collection may collide, do these one at a time.
try:
self.spanner.run_in_transaction(transact, values)
except AlreadyExists:
logging.info(
"Skipping already present collection {}".format(
values
))
pass
finally:
cursor.close()
def get(self, name, collection_id=None):
"""Fetches the collection_id"""
id = self._by_name.get(name)
if id is None:
logging.warn(
"Unknown collection {}:{} encountered!".format(
name, collection_id))
# it would be swell to add these to the collection table,
# but that would mean
# an imbedded spanner transaction, and that's not allowed.
return None
return id
def conf_mysql(dsn):
"""create a connection to the original storage system """
logging.debug("Configuring MYSQL: {}".format(dsn))
connection = connector.connect(
user=dsn.username,
password=dsn.password,
host=dsn.hostname,
port=dsn.port or 3306,
database=dsn.path[1:],
auth_plugin="mysql_native_password"
)
return connection
def conf_spanner(dsn):
"""create a connection to the new Spanner system"""
logging.debug("Configuring SPANNER: {}".format(dsn))
path = dsn.path.split("/")
instance_id = path[-3]
database_id = path[-1]
client = spanner.Client()
instance = client.instance(instance_id)
database = instance.database(database_id)
return database
def conf_db(dsn):
"""read the list of storage definitions from the file and create
a set of connections.
"""
if "mysql" in dsn.scheme:
return conf_mysql(dsn)
if "spanner" in dsn.scheme:
return conf_spanner(dsn)
raise RuntimeError("Unknown DSN type: {}".format(dsn.scheme))
def dumper(columns, values):
"""verbose column and data dumper. """
result = ""
for row in values:
for i in range(0, len(columns)):
result += " {} => {}\n".format(columns[i], row[i])
return result
def newSyncID():
# return a str so the altered payload stays JSON-serializable
return base64.urlsafe_b64encode(os.urandom(9)).decode('ascii')
def alter_syncids(pay):
"""Alter the syncIDs for the meta/global record, which will cause a sync
when the client reconnects
"""
payload = json.loads(pay)
payload['syncID'] = newSyncID()
for item in payload['engines']:
payload['engines'][item]['syncID'] = newSyncID()
return json.dumps(payload)
def divvy(biglist, count):
"""Partition a list into a set of equally sized slices"""
lists = []
biglen = len(biglist)
start = 0
while start < biglen:
lists.append(biglist[start:min(start+count, biglen)])
start += count
return lists
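# Illustrative example (added for clarity): divvy([1, 2, 3, 4, 5], 2)
# returns [[1, 2], [3, 4], [5]] -- fixed-size chunks, with the final
# chunk holding whatever remains.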
def move_user(databases, user_data, collections, fxa, bso_num, args, report):
"""copy user info from original storage to new storage."""
# bso column mapping:
# id => bso_id
# collection => collection_id
# sortindex => sortindex
# modified => modified
# payload => payload
# payload_size => NONE
# ttl => expiry
uc_columns = (
'fxa_uid',
'fxa_kid',
'collection_id',
'modified',
)
bso_columns = (
'collection_id',
'fxa_uid',
'fxa_kid',
'bso_id',
'expiry',
'modified',
'payload',
'sortindex',
)
(uid, fxa_uid, fxa_kid) = user_data
# Fetch the BSO data from the original storage.
sql = """
SELECT
collections.name, bso.collection, uc.last_modified,
bso.id, bso.ttl, bso.modified, bso.payload, bso.sortindex
FROM
bso{} as bso,
collections,
user_collections as uc
WHERE
bso.userid = %s
and collections.collectionid = bso.collection
and uc.collection = bso.collection
and uc.userid = bso.userid
and bso.ttl > unix_timestamp()
ORDER BY
bso.collection, bso.id""".format(bso_num)
unique_key_filter = set()
def spanner_transact_wipe_user(
transaction, fxa_uid, fxa_kid, args):
result = transaction.execute_sql(
"""
SELECT
uc.collection_id, c.name
FROM
user_collections as uc
LEFT JOIN
collections as c
ON
uc.collection_id = c.collection_id
WHERE
uc.fxa_uid = @fxa_uid
AND uc.fxa_kid = @fxa_kid
""",
params=dict(fxa_uid=fxa_uid, fxa_kid=fxa_kid),
param_types=dict(fxa_uid=param_types.STRING, fxa_kid=param_types.STRING),
)
cols = [(row[0], row[1]) for row in result]
if not args.dryrun:
logging.debug("Wiping user, collections: {}".format(cols))
transaction.execute_update(
"""
DELETE FROM
user_collections
WHERE
fxa_uid = @fxa_uid
AND fxa_kid = @fxa_kid
""",
params=dict(fxa_uid=fxa_uid, fxa_kid=fxa_kid),
param_types=dict(fxa_uid=param_types.STRING, fxa_kid=param_types.STRING),
)
else:
logging.debug("Not wiping user, collections: {}".format(cols))
def spanner_transact_uc(
transaction, data, fxa_uid, fxa_kid, args):
# user collections require a unique key.
for (col, cid, cmod, bid, exp, bmod, pay, sid) in data:
collection_id = collections.get(col, cid)
if collection_id is None:
continue
# columns from sync_schema3
# user_collections modified should come directly from
# mysql user_collections.last_modified
mod_v = datetime.utcfromtimestamp(cmod/1000.0)
# User_Collection can only have unique values. Filter
# non-unique keys and take the most recent modified
# time. The join could be anything.
uc_key = "{}_{}_{}".format(fxa_uid, fxa_kid, col)
if uc_key not in unique_key_filter:
uc_values = [(
fxa_uid,
fxa_kid,
collection_id,
mod_v,
)]
if not args.dryrun:
transaction.insert(
'user_collections',
columns=uc_columns,
values=uc_values
)
else:
logging.debug("not writing {} => {}".format(
uc_columns, uc_values))
unique_key_filter.add(uc_key)
def spanner_transact_bso(transaction, data, fxa_uid, fxa_kid, args):
count = 0
bso_values = []
for (col, cid, cmod, bid, exp, bmod, pay, sid) in data:
collection_id = collections.get(col, cid)
if collection_id is None:
continue
if collection_id != cid:
logging.debug(
"Remapping collection '{}' from {} to {}".format(
col, cid, collection_id))
# columns from sync_schema3
mod_v = datetime.utcfromtimestamp(bmod/1000.0)
exp_v = datetime.utcfromtimestamp(exp)
# add the BSO values.
if args.full and col == META_GLOBAL_COLLECTION_NAME:
pay = alter_syncids(pay)
bso_values.append([
collection_id,
fxa_uid,
fxa_kid,
bid,
exp_v,
mod_v,
pay,
sid,
])
count += 1
if not args.dryrun:
logging.debug(
"###bso{} {}".format(
bso_num,
dumper(bso_columns, bso_values)
)
)
for i in range(0, 5):
try:
transaction.insert(
'bsos',
columns=bso_columns,
values=bso_values
)
break
except grpc._channel._InactiveRpcError as ex:
logging.warn(
"Could not write record (attempt {})".format(i),
exc_info=ex)
time.sleep(.5)
else:
logging.debug("not writing {} => {}".format(
bso_columns, bso_values))
return count
cursor = databases['mysql'].cursor()
count = 0
try:
# Note: cursor() does not support __enter__()
logging.info("Processing... {} -> {}:{}".format(
uid, fxa_uid, fxa_kid))
cursor.execute(sql, (uid,))
data = []
abort_col = None
abort_count = None
col_count = 0
if args.abort:
(abort_col, abort_count) = args.abort.split(":")
abort_count = int(abort_count)
for row in cursor:
logging.debug("col: {}".format(row[0]))
if abort_col and int(row[1]) == int(abort_col):
col_count += 1
if col_count > abort_count:
logging.debug("Skipping col: {}: {} of {}".format(
row[0], col_count, abort_count))
continue
data.append(row)
if args.abort:
logging.info("Skipped {} of {} rows for {}".format(
abort_count, col_count, abort_col
))
logging.info(
"Moving {} items for user {} => {}:{}".format(
len(data), uid, fxa_uid, fxa_kid))
if args.wipe_user:
databases['spanner'].run_in_transaction(
spanner_transact_wipe_user,
fxa_uid,
fxa_kid,
args,
)
for bunch in divvy(data, args.chunk or 1000):
# Occasionally, there is a batch fail because a
# user collection is not found before a bso is written.
# to solve that, divide the UC updates from the
# BSO updates.
# Run through the list of UserCollection updates
databases['spanner'].run_in_transaction(
spanner_transact_uc,
bunch,
fxa_uid,
fxa_kid,
args,
)
count += databases['spanner'].run_in_transaction(
spanner_transact_bso,
bunch,
fxa_uid,
fxa_kid,
args,
)
if args.ms_delay > 0:
logging.debug(
"Sleeping for {} seconds".format(args.ms_delay * .01))
time.sleep(args.ms_delay * .01)
except AlreadyExists:
logging.warn(
"User {} already imported fxa_uid:{} / fxa_kid:{}".format(
uid, fxa_uid, fxa_kid
))
report.fail(uid, "exists")
return count
except InvalidArgument as ex:
report.fail(uid, "exists")
if "already inserted" in ex.args[0]:
logging.warn(
"User {} already imported fxa_uid:{} / fxa_kid:{}".format(
uid, fxa_uid, fxa_kid
))
return count
else:
raise
except Exception as ex:
report.fail(uid, "unexpected batch error")
logging.error("Unexpected Batch failure: {}:{}".format(
fxa_uid, fxa_kid), exc_info=ex)
finally:
# cursor may complain about unread data, this should prevent
# that warning.
for result in cursor:
pass
cursor.close()
report.success(uid)
return count
def get_percentage_users(users, user_percent):
(block, percentage) = map(
int, user_percent.split(':'))
total_count = len(users)
chunk_size = max(
1, math.floor(
total_count * (int(percentage) * .01)))
chunk_count = math.ceil(total_count / chunk_size)
chunk_start = max(block - 1, 0) * chunk_size
chunk_end = min(chunk_count, block) * chunk_size
if chunk_size * chunk_count > total_count:
if block >= chunk_count - 1:
chunk_end = total_count
users = users[chunk_start:chunk_end]
logging.debug(
"moving users: {} to {}".format(
chunk_start, chunk_end))
return users
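# Illustrative example (added for clarity): with 10 users and
# user_percent="2:50", chunk_size is 5 and the slice users[5:10] is
# returned, i.e. the second 50% of the users.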
def get_users(args, databases, fxa, bso_num, report):
"""Fetch the user information from the Tokenserver Dump """
users = []
try:
if args.user:
for uid in args.user:
try:
(fxa_kid, fxa_uid) = fxa.get(uid)
users.append((uid, fxa_uid, fxa_kid))
except TypeError:
logging.error(
"User {} not found in "
"tokenserver data.".format(uid))
report.fail(uid, "not found")
else:
try:
bso_users_file = args.bso_users_file.replace('#', str(bso_num))
with open(bso_users_file) as bso_file:
line = 0
for row in csv.reader(
bso_file, delimiter="\t"
):
if row[0] == "uid":
continue
users.append(row)
tick(line)
line += 1
except Exception as ex:
logging.critical("Error reading BSO data", exc_info=ex)
exit(-1)
if args.user_percent:
users = get_percentage_users(users, args.user_percent)
except Exception as ex:
logging.critical("Unexpected Error moving database:", exc_info=ex)
exit(-1)
return users
def move_database(databases, collections, bso_num, fxa, args, report):
"""iterate over provided users and move their data from old to new"""
start = time.time()
# off chance that someone else might have written
# a new collection table since the last time we
# fetched.
rows = 0
users = get_users(args, databases, fxa, bso_num, report)
logging.info("Moving {} users".format(len(users)))
for user in users:
rows += move_user(
databases=databases,
user_data=user,
collections=collections,
fxa=fxa,
bso_num=bso_num,
args=args,
report=report)
logging.info("Finished BSO #{} ({} rows) in {} seconds".format(
bso_num,
rows,
math.ceil(time.time() - start)
))
return rows
def get_args():
pid = os.getpid()
today = datetime.now().strftime("%Y_%m_%d")
parser = argparse.ArgumentParser(
description="move user from sql to spanner")
parser.add_argument(
'--dsns', default="move_dsns.lst",
help="file of new line separated DSNs")
parser.add_argument(
'--verbose',
action="store_true",
help="verbose logging"
)
parser.add_argument(
'--quiet',
action="store_true",
help="silence logging"
)
parser.add_argument(
'--offset', type=int, default=0,
help="UID to start at (default 0)")
parser.add_argument(
"--full",
action="store_true",
help="force a full reconcile"
)
parser.add_argument(
'--anon', action='store_true',
help="Anonymize the user data"
)
parser.add_argument(
'--start_bso', default=0,
type=int,
help="start dumping BSO database (default: 0)"
)
parser.add_argument(
'--end_bso',
type=int, default=19,
help="last BSO database to dump (default: 19)"
)
parser.add_argument(
'--bso_num',
type=int,
help="only move this bso (equivalent to start_bso == end_bso)"
)
parser.add_argument(
'--write_chunk',
dest="chunk",
default=1666,
help="how many rows per transaction for spanner (default: 1666)"
)
parser.add_argument(
'--user',
type=str,
help="BSO#:userId[,userid,...] to move."
)
parser.add_argument(
'--retry_file',
type=str,
help="Copy of failure file to read user IDs to retry."
)
parser.add_argument(
'--wipe_user',
action="store_true",
help="delete any pre-existing --user data on spanner before the migration"
)
parser.add_argument(
'--bso_users_file',
default="bso_users_#_{}.lst".format(today),
help="name of the generated BSO user file. "
"(Will use bso number for `#` if present; "
"default: bso_users_#_{}.lst)".format(today),
)
parser.add_argument(
'--fxa_users_file',
default="fxa_users_{}.lst".format(today),
help="List of pre-generated FxA users. Only needed if specifying"
" the `--user` option; default: fxa_users_{}.lst)".format(today)
)
parser.add_argument(
'--dryrun',
action="store_true",
help="Do not write user records to spanner"
)
parser.add_argument(
'--abort',
type=str,
help="abort data in col after #rows (e.g. history:10)"
)
parser.add_argument(
"--user_percent", default="1:100",
help=("Offset and percent of users from this BSO"
"to move (e.g. 2:50 moves the second 50%%) "
"(default 1:100)")
)
parser.add_argument(
'--ms_delay', type=int, default=0,
help="inject a sleep between writes to spanner as a throttle"
)
parser.add_argument(
'--success_file', default="success_{}.log".format(pid),
help="File of successfully migrated userids"
)
parser.add_argument(
'--failure_file', default="failure_{}.log".format(pid),
help="File of unsuccessfully migrated userids"
)
return parser.parse_args()
def main():
args = get_args()
log_level = logging.INFO
if args.quiet:
log_level = logging.ERROR
if args.verbose:
log_level = logging.DEBUG
logging.basicConfig(
stream=sys.stdout,
level=log_level,
)
report = Report(args)
dsns = open(args.dsns).readlines()
databases = {}
rows = 0
if args.user:
args.user_percent = "1:100"
(bso, userid) = args.user.split(':')
args.start_bso = int(bso)
args.end_bso = int(bso)
user_list = []
for id in userid.split(','):
user_list.append(int(id))
args.user = user_list
elif args.wipe_user:
raise RuntimeError("--wipe_user requires --user")
if args.retry_file:
(args.start_bso, args.end_bso, args.user) = report.read_failure(
args.retry_file)
if args.bso_num is not None:
args.start_bso = args.end_bso = args.bso_num
for line in dsns:
dsn = urlparse(line.strip())
scheme = dsn.scheme
if 'mysql' in dsn.scheme:
scheme = 'mysql'
databases[scheme] = conf_db(dsn)
if not databases.get('mysql') or not databases.get('spanner'):
raise RuntimeError("Both mysql and spanner dsns must be specified")
fxa_info = FXA_info(args.fxa_users_file, args, report)
collections = Collections(databases)
logging.info("Starting:")
if args.dryrun:
logging.info("=== DRY RUN MODE ===")
start = time.time()
for bso_num in range(args.start_bso, args.end_bso+1):
logging.info("Moving users in bso # {}".format(bso_num))
report.bso = bso_num
rows += move_database(
databases, collections, bso_num, fxa_info, args, report)
logging.info(
"Moved: {} rows in {} seconds".format(
rows or 0, time.time() - start))
if __name__ == "__main__":
main()

View File

@ -1,111 +0,0 @@
#! venv/bin/python
# painfully stupid script to check out dumping a spanner database to avro.
# Avro is basically "JSON" for databases. It's not super complicated & it has
# issues (one of which is that it requires Python2).
# test run Dumped 2770783 rows in 457.566066027 seconds and produced a
# roughly 6.5GB file.
#
# Spanner also has a Deadline issue where it will kill a db connection after
# so many minutes (5?). Might be better to just divvy things up into clusters
# and have threads handle transporting records over.
#
import avro.schema
import argparse
import time
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from google.cloud import spanner
def get_args():
parser = argparse.ArgumentParser(description="dump spanner to arvo files")
parser.add_argument(
'--instance_id', default="spanner-test",
help="Spanner instance name")
parser.add_argument(
'--database_id', default="sync_schema3",
help="Spanner database name")
parser.add_argument(
'--schema', default="sync.avsc",
help="Database schema description")
parser.add_argument(
'--output', default="output.avso",
help="Output file")
parser.add_argument(
'--limit', type=int, default=1500000,
help="Limit to n rows")
return parser.parse_args()
def conf_spanner(args):
spanner_client = spanner.Client()
instance = spanner_client.instance(args.instance_id)
database = instance.database(args.database_id)
return database
def dump_rows(offset, db, writer, args):
print("Querying.... @{}".format(offset))
sql = """
SELECT collection_id, fxa_kid, fxa_uid, bso_id,
UNIX_MICROS(expiry), UNIX_MICROS(modified), payload,
sortindex from bsos LIMIT {} OFFSET {}""".format(args.limit, offset)
try:
with db.snapshot() as snapshot:
result = snapshot.execute_sql(sql)
print("Dumping...")
for row in result:
writer.append({
"collection_id": row[0],
"fxa_kid": row[1],
"fxa_uid": row[2],
"bso_id": row[3],
"expiry": row[4],
"modified": row[5],
"payload": row[6],
"sortindex": row[7]})
offset += 1
if offset % 1000 == 0:
print("Row: {}".format(offset))
return offset
except Exception as ex:
print("Deadline hit at: {} ({})".format(offset, ex))
return offset
def count_rows(db):
with db.snapshot() as snapshot:
result = snapshot.execute_sql("SELECT Count(*) from bsos")
return result.one()[0]
def dump_data(args, schema):
offset = 0
# things time out around 1_500_000 rows.
db = conf_spanner(args)
writer = DataFileWriter(
open(args.output, "wb"), DatumWriter(), schema)
row_count = count_rows(db)
print("Dumping {} rows".format(row_count))
while offset < row_count:
old_offset = offset
offset = dump_rows(offset=offset, db=db, writer=writer, args=args)
if offset == old_offset:
break
writer.close()
return row_count
def main():
start = time.time()
args = get_args()
schema = avro.schema.parse(open(args.schema, "rb").read())
rows = dump_data(args, schema)
print("Dumped: {} rows in {} seconds".format(rows, time.time() - start))
if __name__ == "__main__":
main()

View File

@ -1,312 +0,0 @@
#! venv/bin/python
# This file is historical.
# We're using `migrate_node.py`, however this file may be useful in the future
# if we determine there's a problem with directly transcribing the data from
# mysql to spanner.
#
# painfully stupid script to check out dumping mysql databases to avro.
# Avro is basically "JSON" for databases. It's not super complicated & it has
# issues.
#
import avro.schema
import argparse
import binascii
import csv
import base64
import math
import time
import os
import random
import re
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from mysql import connector
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
MAX_ROWS=1500000
class BadDSNException(Exception):
pass
def get_args():
parser = argparse.ArgumentParser(description="dump mysql to avro files")
parser.add_argument(
'--dsns', default="dsns.lst",
help="file of new line separated DSNs")
parser.add_argument(
'--schema', default="sync.avsc",
help="Database schema description")
parser.add_argument(
'--col_schema', default="user_collection.avsc",
help="User Collection schema description"
)
parser.add_argument(
'--output', default="output.avso",
help="Output file")
parser.add_argument(
'--limit', type=int, default=1500000,
help="Limit each read chunk to n rows")
parser.add_argument(
'--offset', type=int, default=0,
help="UID to start at")
parser.add_argument(
'--deanon', action='store_false',
dest='anon',
help="Anonymize the user data"
)
parser.add_argument(
'--start_bso', default=0,
type=int,
help="start dumping BSO database"
)
parser.add_argument(
'--end_bso',
type=int, default=19,
help="last BSO database to dump"
)
parser.add_argument(
'--token_file',
default='users.csv',
help="token user database dump CSV"
)
parser.add_argument(
'--skip_collections', action='store_true',
help="skip user_collections table"
)
return parser.parse_args()
def conf_db(dsn):
dsn = urlparse(dsn)
"""
if dsn.scheme != "mysql":
raise BadDSNException("Invalid MySQL dsn: {}".format(dsn))
"""
connection = connector.connect(
user=dsn.username,
password=dsn.password,
host=dsn.hostname,
port=dsn.port or 3306,
database=dsn.path[1:]
)
return connection
# The following two functions are taken from browserid.utils
def encode_bytes_b64(value):
return base64.urlsafe_b64encode(value).rstrip(b'=').decode('ascii')
def format_key_id(keys_changed_at, key_hash):
return "{:013d}-{}".format(
keys_changed_at,
encode_bytes_b64(key_hash),
)
user_ids = {}
def read_in_token_file(filename):
global user_ids
# you can generate the token file using
# `mysql -e "select uid, email, generation, keys_changed_at, \
# client_state from users;" > users.csv`
#
# future opt: write the transmogrified file to either sqlite3
# or static files.
print("Processing token file...")
with open(filename) as csv_file:
for (uid, email, generation,
keys_changed_at, client_state) in csv.reader(
csv_file, delimiter="\t"):
if uid == 'uid':
# skip the header row.
continue
fxa_uid = email.split('@')[0]
fxa_kid = "{:013d}-{}".format(
int(keys_changed_at or generation),
base64.urlsafe_b64encode(
binascii.unhexlify(client_state)
).rstrip(b'=').decode('ascii'))
user_ids[uid] = (fxa_kid, fxa_uid)
def get_fxa_id(user_id, anon=True):
global user_ids
if user_id in user_ids:
return user_ids[user_id]
if anon:
fxa_uid = binascii.hexlify(
os.urandom(16)).decode('utf-8')
fxa_kid = binascii.hexlify(
os.urandom(16)).decode('utf-8')
user_ids[user_id] = (fxa_kid, fxa_uid)
return (fxa_kid, fxa_uid)
def dump_user_collections(schema, dsn, args):
# userid => fxa_kid
# fxa_uid
# collection => collection_id
# last_modified => modified
db = conf_db(dsn)
cursor = db.cursor()
out_file = args.output.rsplit('.', 1)
out_file_name = "{}_user_collections.{}".format(
out_file[0], out_file[1]
)
writer = DataFileWriter(
open(out_file_name, "wb"), DatumWriter(), schema)
sql = """
SELECT userid, collection, last_modified from user_collections
"""
start = time.time()
try:
cursor.execute(sql)
row = 0
for (user_id, collection_id, last_modified) in cursor:
(fxa_uid, fxa_kid) = get_fxa_id(user_id, args.anon)
try:
writer.append({
"collection_id": collection_id,
"fxa_kid": fxa_kid,
"fxa_uid": fxa_uid,
"modified": last_modified
})
except Exception as ex:
import pdb; pdb.set_trace()
print (ex)
row += 1
print(
"Dumped {} user_collection rows in {} seconds".format(
row, time.time() - start
))
finally:
writer.close()
cursor.close()
def dump_rows(bso_number, chunk_offset, db, writer, args):
# bso column mapping:
# id => bso_id
# collection => collection_id
# sortindex => sortindex
# modified => modified
# payload => payload
# payload_size => NONE
# ttl => expiry
ivre = re.compile(r'("IV": ?"[^"]+")')
print("Querying.... bso{} @{}".format(bso_number, chunk_offset))
sql = """
SELECT userid, collection, id,
ttl, modified, payload,
sortindex from bso{} LIMIT {} OFFSET {}""".format(
bso_number, args.limit, chunk_offset)
cursor = db.cursor()
user = None
row_count = 0
try:
cursor.execute(sql)
print("Dumping...")
for (userid, cid, bid, exp, mod, pay, si) in cursor:
if args.anon:
replacement = encode_bytes_b64(os.urandom(16))
pay = ivre.sub('"IV":"{}"'.format(replacement), pay)
if userid != user:
(fxa_kid, fxa_uid) = get_fxa_id(userid, args.anon)
user = userid
writer.append({
"fxa_uid": fxa_uid,
"fxa_kid": fxa_kid,
"collection_id": cid,
"bso_id": bid,
"expiry": exp,
"modified": mod,
"payload": pay,
"sortindex": si})
row_count += 1
if (chunk_offset + row_count) % 1000 == 0:
print("BSO:{} Row: {}".format(bso_number, chunk_offset + row_count))
if row_count >= MAX_ROWS:
break
except Exception as e:
print("Deadline hit at: {} ({})".format(
chunk_offset + row_count, e))
finally:
cursor.close()
return row_count
def count_rows(db, bso_num=0):
cursor = db.cursor()
try:
cursor.execute("SELECT Count(*) from bso{}".format(bso_num))
return cursor.fetchone()[0]
finally:
cursor.close()
def dump_data(bso_number, schema, dsn, args):
offset = args.offset or 0
total_rows = 0
# things time out around 1_500_000 rows.
db = conf_db(dsn)
out_file = args.output.rsplit('.', 1)
row_count = count_rows(db, bso_number)
for chunk in range(
max(1, math.trunc(math.ceil(row_count / MAX_ROWS)))):
print(
"Dumping {} rows from bso#{} into chunk {}".format(
row_count, bso_number, chunk))
out_file_name = "{}_{}_{}.{}".format(
out_file[0], bso_number, hex(chunk), out_file[1]
)
writer = DataFileWriter(
open(out_file_name, "wb"), DatumWriter(), schema)
rows = dump_rows(
bso_number=bso_number,
chunk_offset=offset,
db=db,
writer=writer,
args=args)
writer.close()
if rows == 0:
break
offset = offset + rows
chunk += 1
return rows
def main():
args = get_args()
rows = 0
dsns = open(args.dsns).readlines()
schema = avro.schema.parse(open(args.schema, "rb").read())
col_schema = avro.schema.parse(open(args.col_schema, "rb").read())
if args.token_file:
read_in_token_file(args.token_file)
start = time.time()
for dsn in dsns:
print("Starting: {}".format(dsn))
try:
if not args.skip_collections:
dump_user_collections(col_schema, dsn, args)
for bso_num in range(args.start_bso, args.end_bso+1):
rows = dump_data(bso_num, schema, dsn, args)
except Exception as ex:
print("Could not process {}: {}".format(dsn, ex))
print("Dumped: {} rows in {} seconds".format(rows, time.time() - start))
if __name__ == "__main__":
main()

View File

@ -1,516 +0,0 @@
#! venv/bin/python
# This file is historical.
# This file will attempt to copy a user from an existing mysql database
# to a spanner table. It requires access to the tokenserver db, which may
# not be available in production environments.
#
#
import argparse
import logging
import base64
import json
import sys
import os
import time
from datetime import datetime
from mysql import connector
from mysql.connector.errors import IntegrityError
from google.cloud import spanner
from google.api_core.exceptions import AlreadyExists
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
SPANNER_NODE_ID = 800
META_GLOBAL_COLLECTION_ID = 6
class BadDSNException(Exception):
pass
# From server_syncstorage
class MigrationState:
UNKNOWN = 0
IN_PROGRESS = 1
COMPLETE = 2
class Collections:
"""Cache spanner collection list.
The spanner collection list is the (soon to be) single source of
truth regarding collection ids.
"""
_by_name = {}
databases = None
def __init__(self, databases):
"""Get the cache list of collection ids"""
sql = """
SELECT
name, collection_id
FROM
collections;
"""
self.databases = databases
logging.debug("Fetching collections...")
with self.databases['spanner'].snapshot() as cursor:
rows = cursor.execute_sql(sql)
for row in rows:
self._by_name[row[0]] = row[1]
def get_id(self, name, cursor):
""" Get/Init the ID for a given collection """
if name in self._by_name:
return self._by_name.get(name)
result = cursor.execute_sql("""
SELECT
COALESCE(MAX(collection_id), 1)
FROM
collections""")
# preserve the "reserved" / < 100 ids.
collection_id = max(result.one()[0] + 1, 101)
cursor.insert(
table="collections",
columns=('collection_id', 'name'),
values=[
(collection_id, name)
]
)
self._by_name[name] = collection_id
return collection_id
def get_args():
parser = argparse.ArgumentParser(
description="move user from sql to spanner")
parser.add_argument(
'--dsns', default="move_dsns.lst",
help="file of new line separated DSNs")
parser.add_argument(
'--users', default="move_users.lst",
help="file of new line separated users to move")
parser.add_argument(
'--token_dsn',
help="DSN to the token server database (optional)"
)
parser.add_argument(
'--verbose',
action="store_true",
help="verbose logging"
)
parser.add_argument(
'--quiet',
action="store_true",
help="silence logging"
)
parser.add_argument(
"--full",
action="store_true",
help="force a full reconcile"
)
return parser.parse_args()
def conf_mysql(dsn):
"""create a connection to the original storage system """
logging.debug("Configuring MYSQL: {}".format(dsn))
connection = connector.connect(
user=dsn.username,
password=dsn.password,
host=dsn.hostname,
port=dsn.port or 3306,
database=dsn.path[1:]
)
return connection
def conf_spanner(dsn):
"""create a connection to the new Spanner system"""
logging.debug("Configuring SPANNER: {}".format(dsn))
path = dsn.path.split("/")
instance_id = path[-3]
database_id = path[-1]
client = spanner.Client()
instance = client.instance(instance_id)
database = instance.database(database_id)
return database
def conf_db(dsn):
"""read the list of storage definitions from the file and create
a set of connections.
"""
if dsn.scheme == "mysql":
return conf_mysql(dsn)
if dsn.scheme == "spanner":
return conf_spanner(dsn)
raise RuntimeError("Unknown DNS type: {}".format(dsn.scheme))
def update_token(databases, user):
"""optionally update the TokenServer storage indicating the user
is now on Spanner
"""
if 'token' not in databases:
logging.warn(
"Skipping token update for user {}...".format(user))
return
logging.info("Updating token server for user: {}".format(user))
try:
cursor = databases['token'].cursor()
cursor.execute(
"""
UPDATE
users
SET
replaced_at = {timestamp},
nodeid = {nodeid}
WHERE
uid = {uid}
""".format(
timestamp=int(time.time() * 100),
nodeid=SPANNER_NODE_ID,
uid=user)
)
databases['token'].commit()
finally:
cursor.close()
# The following two functions are taken from browserid.utils
def encode_bytes_b64(value):
return base64.urlsafe_b64encode(value).rstrip(b'=').decode('ascii')
def format_key_id(keys_changed_at, key_hash):
return "{:013d}-{}".format(
keys_changed_at,
encode_bytes_b64(key_hash),
)
def get_fxa_id(databases, user):
"""generate the spanner user key values from the original storage
data.
"""
sql = """
SELECT
email, generation, keys_changed_at, client_state, node
FROM users
WHERE uid = {uid}
""".format(uid=user)
try:
cursor = databases.get('token', databases['mysql']).cursor()
cursor.execute(sql)
(email, generation, keys_changed_at,
client_state, node) = cursor.next()
fxa_uid = email.split('@')[0]
fxa_kid = format_key_id(
keys_changed_at or generation,
bytes.fromhex(client_state),
)
finally:
cursor.close()
return (fxa_kid, fxa_uid, node)
def create_migration_table(database):
"""create the syncstorage table
This table tells the syncstorage server to return a 5xx for a
given user. It's important that syncstorage NEVER returns a
2xx result for any user that's in migration, or only does
so after deleting the meta/global BSO record so that a full
reconcile happens. (Depends on
https://github.com/mozilla-services/server-syncstorage/pull/136)
"""
try:
cursor = database.cursor()
cursor.execute(
"""CREATE TABLE IF NOT EXISTS
migration (
fxa_uid VARCHAR(255) NOT NULL PRIMARY KEY,
started_at BIGINT NOT NULL,
state SMALLINT
)
""")
database.commit()
finally:
cursor.close()
def dumper(columns, values):
"""verbose column and data dumper. """
result = ""
for row in values:
for i in range(0, len(columns)):
result += " {} => {}\n".format(columns[i], row[i])
return result
def mark_user(databases, user, state=MigrationState.IN_PROGRESS):
""" mark a user in migration """
try:
mysql = databases['mysql'].cursor()
if state == MigrationState.IN_PROGRESS:
try:
logging.info("Marking {} as migrating...".format(user))
mysql.execute(
"INSERT INTO migration "
"(fxa_uid, started, state) VALUES (%s, %s, %s)",
(user, int(time.time()), state)
)
databases['mysql'].commit()
except IntegrityError:
return False
if state == MigrationState.COMPLETE:
logging.info("Marking {} as migrating...".format(user))
mysql.execute(
"UPDATE migration SET state = %s WHERE fxa_uid = %s",
(state, user)
)
databases['mysql'].commit()
finally:
mysql.close()
return True
def finish_user(databases, user):
"""mark a user migration complete"""
# This is not wrapped into `mark_user` so that I can reduce
# the number of db IO, since an upsert would just work instead
# of fail out with a dupe.
mysql = databases['mysql'].cursor()
try:
logging.info("Marking {} as migrating...".format(user))
mysql.execute(
"""
UPDATE
migration
SET
state = "finished"
WHERE
fxa_uid = %s
""",
(user,)
)
databases['mysql'].commit()
except IntegrityError:
return False
finally:
mysql.close()
return True
def newSyncID():
# return a str so the altered payload stays JSON-serializable
return base64.urlsafe_b64encode(os.urandom(9)).decode('ascii')
def alter_syncids(pay):
"""Alter the syncIDs for the meta/global record, which will cause a sync
when the client reconnects
"""
payload = json.loads(pay)
payload['syncID'] = newSyncID()
for item in payload['engines']:
payload['engines'][item]['syncID'] = newSyncID()
return json.dumps(payload)
def move_user(databases, user, args):
"""copy user info from original storage to new storage."""
# bso column mapping:
# id => bso_id
# collection => collection_id
# sortindex => sortindex
# modified => modified
# payload => payload
# payload_size => NONE
# ttl => expiry
# user collections require a unique key.
unique_key_filter = set()
# off chance that someone else might have written
# a new collection table since the last time we
# fetched.
collections = Collections(databases)
uc_columns = (
'fxa_kid',
'fxa_uid',
'collection_id',
'modified',
)
bso_columns = (
'collection_id',
'fxa_kid',
'fxa_uid',
'bso_id',
'expiry',
'modified',
'payload',
'sortindex',
)
# Generate the Spanner keys we'll need.
(fxa_kid, fxa_uid, original_node) = get_fxa_id(databases, user)
if not mark_user(databases, fxa_uid):
logging.error("User {} already being migrated?".format(fxa_uid))
return
# Fetch the BSO data from the original storage.
sql = """
SELECT
collections.name, bso.collection,
bso.id, bso.ttl, bso.modified, bso.payload, bso.sortindex
FROM
collections, bso
WHERE
bso.userid = %s and collections.collectionid = bso.collection
ORDER BY
modified DESC"""
count = 0
def spanner_transact(transaction):
collection_id = collections.get_id(col, transaction)
if collection_id != cid:
logging.warn(
"Remapping collection '{}' from {} to {}".format(
col, cid, collection_id))
# columns from sync_schema3
mod_v = datetime.utcfromtimestamp(mod/1000.0)
exp_v = datetime.utcfromtimestamp(exp)
# User_Collection can only have unique values. Filter
# non-unique keys and take the most recent modified
# time. The join could be anything.
uc_key = "{}_{}_{}".format(fxa_uid, fxa_kid, col)
if uc_key not in unique_key_filter:
unique_key_filter.add(uc_key)
uc_values = [(
fxa_kid,
fxa_uid,
collection_id,
mod_v,
)]
logging.debug(
"### uc: {}".format(uc_columns, uc_values))
transaction.insert(
'user_collections',
columns=uc_columns,
values=uc_values
)
# add the BSO values.
if args.full and collection_id == META_GLOBAL_COLLECTION_ID:
pay = alter_syncids(pay)
bso_values = [[
collection_id,
fxa_kid,
fxa_uid,
bid,
exp_v,
mod_v,
pay,
sid,
]]
logging.debug(
"###bso: {}".format(dumper(bso_columns, bso_values)))
transaction.insert(
'bsos',
columns=bso_columns,
values=bso_values
)
mysql = databases['mysql'].cursor()
try:
# Note: cursor() does not support __enter__()
mysql.execute(sql, (user,))
logging.info("Processing... {} -> {}:{}".format(
user, fxa_uid, fxa_kid))
for (col, cid, bid, exp, mod, pay, sid) in mysql:
databases['spanner'].run_in_transaction(spanner_transact)
update_token(databases, user)
(ck_kid, ck_uid, ck_node) = get_fxa_id(databases, user)
if ck_node != original_node:
logging.error(
("User's Node Changed! Aborting! "
"fx_uid:{}, fx_kid:{}, node: {} => {}")
.format(user, fxa_uid, fxa_kid,
original_node, ck_node)
)
return
finish_user(databases, user)
count += 1
# Closing the with automatically calls `batch.commit()`
mark_user(databases, user, MigrationState.COMPLETE)
except AlreadyExists:
logging.warn(
"User already imported fxa_uid:{} / fxa_kid:{}".format(
fxa_uid, fxa_kid
))
except Exception as e:
logging.error("### batch failure:", e)
finally:
# cursor may complain about unread data, this should prevent
# that warning.
for result in mysql:
pass
logging.debug("Closing...")
mysql.close()
return count
def move_data(databases, users, args):
"""iterate over provided users and move their data from old to new"""
for user in users:
rows = move_user(databases, user.strip(), args)
return rows
def main():
start = time.time()
args = get_args()
log_level = logging.INFO
if args.quiet:
log_level = logging.ERROR
if args.verbose:
log_level = logging.DEBUG
logging.basicConfig(
stream=sys.stdout,
level=log_level,
)
dsns = open(args.dsns).readlines()
users = open(args.users).readlines()
databases = {}
for line in dsns:
dsn = urlparse(line.strip())
databases[dsn.scheme] = conf_db(dsn)
if args.token_dsn:
dsn = urlparse(args.token_dsn)
databases['token'] = conf_db(dsn)
if not databases.get('mysql') or not databases.get('spanner'):
RuntimeError("Both mysql and spanner dsns must be specified")
# create the migration table if it's not already present.
# This table is used by the sync storage server to force a 500 return
# for a user in migration.
create_migration_table(databases['mysql'])
logging.info("Starting:")
rows = move_data(databases, users, args)
logging.info(
"Moved: {} rows in {} seconds".format(
rows or 0, time.time() - start))
if __name__ == "__main__":
main()

View File

@ -1,4 +0,0 @@
wheel
avro-python3
google-cloud-spanner
mysql-connector

View File

@ -1,13 +0,0 @@
{"namespace": "bso.avro",
"type": "record",
"name": "bso",
"fields": [
{"name": "fxa_uid", "type": ["null", "string"]},
{"name": "fxa_kid", "type": ["null", "string"]},
{"name": "collection_id", "type": ["null", "long"]},
{"name": "bso_id", "type": "string"},
{"name": "expiry", "type": "long"},
{"name": "modified", "type": "long"},
{"name": "payload", "type": "string"},
{"name": "sortindex", "type": ["null", "long"]}
]}

View File

@ -1,3 +0,0 @@
wheel
google-cloud-spanner
mysql-connector