syncstorage-rs/tools/user_migration/old/dump_avro.py
Commit ecfca9fdf5 by JR Conlin (2020-03-02): feat: more user_migration stuff (#450)

* create script to move users by node directly
* moved old scripts to `old` directory (for historical reasons, as well as
  possible future use)
* cleaned up README
* try to solve the `parent row` error
  the intermittent error may result from one of two things:
  1) a transaction failure resulted in a premature add of the unique key
     to the UC filter.
  2) an internal spanner update error resulting from trying to write the
     bso before the user_collection row was written.
* added "fix_collections.sql" script to update the collections table with
  well-known collections for future rectification.
* returned collection name lookup
* add "--user" arg to set bso and user id
* add `--dryrun` mode

#! venv/bin/python
# painfully stupid script to check out dumping a spanner database to avro.
# Avro is basically "JSON" for databases. It's not super complicated & it
# has issues (one of which is that it requires Python 2).
# A test run dumped 2770783 rows in 457.566066027 seconds and produced a
# roughly 6.5GB file.
#
# Spanner also has a Deadline issue where it will kill a db connection after
# so many minutes (5?). It might be better to just divvy things up into
# clusters and have threads handle transporting records over (see the
# dump_rows_threaded() sketch further down for one way that might look).
#
import avro.schema
import argparse
import time
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from google.cloud import spanner


def get_args():
    parser = argparse.ArgumentParser(
        description="dump spanner to avro files")
    parser.add_argument(
        '--instance_id', default="spanner-test",
        help="Spanner instance name")
    parser.add_argument(
        '--database_id', default="sync_schema3",
        help="Spanner database name")
    parser.add_argument(
        '--schema', default="sync.avsc",
        help="Database schema description")
    parser.add_argument(
        '--output', default="output.avso",
        help="Output file")
    parser.add_argument(
        '--limit', type=int, default=1500000,
        help="Limit to n rows")
    return parser.parse_args()
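

# A rough idea of what the sync.avsc schema this script reads might look
# like, reconstructed from the columns dumped below. This constant is only
# illustrative and is never used by the script; the real sync.avsc in the
# repo is authoritative, and the exact field types here are guesses.
EXAMPLE_BSO_SCHEMA = """
{
    "namespace": "sync",
    "type": "record",
    "name": "bso",
    "fields": [
        {"name": "collection_id", "type": "long"},
        {"name": "fxa_kid", "type": "string"},
        {"name": "fxa_uid", "type": "string"},
        {"name": "bso_id", "type": "string"},
        {"name": "expiry", "type": "long"},
        {"name": "modified", "type": "long"},
        {"name": "payload", "type": "string"},
        {"name": "sortindex", "type": ["null", "long"]}
    ]
}
"""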


def conf_spanner(args):
    spanner_client = spanner.Client()
    instance = spanner_client.instance(args.instance_id)
    database = instance.database(args.database_id)
    return database


def dump_rows(offset, db, writer, args):
    # offset doubles as a running row count, so the caller can resume from
    # wherever a Spanner deadline cut this read off.
    print("Querying.... @{}".format(offset))
    sql = """
    SELECT collection_id, fxa_kid, fxa_uid, bso_id,
        UNIX_MICROS(expiry), UNIX_MICROS(modified), payload,
        sortindex from bsos LIMIT {} OFFSET {}""".format(args.limit, offset)
    try:
        with db.snapshot() as snapshot:
            result = snapshot.execute_sql(sql)
            print("Dumping...")
            for row in result:
                writer.append({
                    "collection_id": row[0],
                    "fxa_kid": row[1],
                    "fxa_uid": row[2],
                    "bso_id": row[3],
                    "expiry": row[4],
                    "modified": row[5],
                    "payload": row[6],
                    "sortindex": row[7]})
                offset += 1
                if offset % 1000 == 0:
                    print("Row: {}".format(offset))
        return offset
    except Exception as ex:
        print("Deadline hit at: {} ({})".format(offset, ex))
        return offset


def count_rows(db):
    with db.snapshot() as snapshot:
        result = snapshot.execute_sql("SELECT Count(*) from bsos")
        return result.one()[0]


def dump_data(args, schema):
    offset = 0
    # things time out around 1_500_000 rows.
    db = conf_spanner(args)
    writer = DataFileWriter(
        open(args.output, "wb"), DatumWriter(), schema)
    row_count = count_rows(db)
    print("Dumping {} rows".format(row_count))
    while offset < row_count:
        old_offset = offset
        offset = dump_rows(offset=offset, db=db, writer=writer, args=args)
        if offset == old_offset:
            break
    writer.close()
    return row_count
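

# The header comment suggests divvying the table up into clusters and letting
# threads move the records, so no single snapshot lives long enough to hit
# the Spanner deadline. The function below is only a sketch of that idea and
# is never called: the chunk size, worker count, per-worker output-file
# naming, and the dump_chunk() helper are illustrative assumptions, not part
# of the original tool.
def dump_rows_threaded(args, schema, chunk_size=100000, workers=4):
    import threading

    db = conf_spanner(args)
    row_count = count_rows(db)

    def dump_chunk(writer, chunk_offset):
        # Each chunk gets its own short-lived snapshot and LIMIT/OFFSET
        # window, so no single read has to survive the whole dump.
        sql = """
        SELECT collection_id, fxa_kid, fxa_uid, bso_id,
            UNIX_MICROS(expiry), UNIX_MICROS(modified), payload,
            sortindex from bsos LIMIT {} OFFSET {}""".format(
                chunk_size, chunk_offset)
        with db.snapshot() as snapshot:
            for row in snapshot.execute_sql(sql):
                writer.append({
                    "collection_id": row[0],
                    "fxa_kid": row[1],
                    "fxa_uid": row[2],
                    "bso_id": row[3],
                    "expiry": row[4],
                    "modified": row[5],
                    "payload": row[6],
                    "sortindex": row[7]})

    def worker(worker_id):
        # DataFileWriter isn't thread-safe, so each worker writes its own
        # avro file; the per-worker files can be merged or loaded separately.
        writer = DataFileWriter(
            open("{}.{}".format(args.output, worker_id), "wb"),
            DatumWriter(), schema)
        for chunk_offset in range(worker_id * chunk_size, row_count,
                                  workers * chunk_size):
            dump_chunk(writer, chunk_offset)
        writer.close()

    threads = [threading.Thread(target=worker, args=(i,))
               for i in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()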


def main():
    start = time.time()
    args = get_args()
    schema = avro.schema.parse(open(args.schema, "rb").read())
    rows = dump_data(args, schema)
    print("Dumped: {} rows in {} seconds".format(rows, time.time() - start))


if __name__ == "__main__":
    main()
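

# Example invocation (values are illustrative; the defaults match get_args()):
#   venv/bin/python dump_avro.py --instance_id spanner-test \
#       --database_id sync_schema3 --schema sync.avsc \
#       --output output.avso --limit 1500000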