mirror of
https://github.com/mozilla-services/syncstorage-rs.git
synced 2025-08-06 20:06:57 +02:00
* feat: more user_migration stuff
* create script to move users by node directly
* moved old scripts to `old` directory (for historic reasons, as well as possible future use)
* cleaned up README
* try to solve the `parent row` error; the intermittent error may stem from one of two things: 1) a transaction failure resulted in a premature add of the unique key to the UC filter, or 2) an internal spanner update error resulting from trying to write the bso before the user_collection row was written.
* added "fix_collections.sql" script to update the collections table with well-known collections for future rectification
* returned collection name lookup
* add "--user" arg to set bso and user id
* add `--dryrun` mode
112 lines
3.4 KiB
Python
#! venv/bin/python

# Painfully stupid script to check out dumping a Spanner database to Avro.
# Avro is basically "JSON" for databases. It's not super complicated & it has
# issues (one of which is that it requires Python 2).
# A test run dumped 2770783 rows in 457.566066027 seconds and produced a
# roughly 6.5GB file.
#
# Spanner also has a Deadline issue where it will kill a db connection after
# so many minutes (5?). It might be better to divvy things up into chunks and
# have threads handle transporting records over (a rough sketch of that idea
# follows dump_rows() below).
#
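# The --schema argument below points at an Avro schema (sync.avsc) whose
# record fields have to line up with the columns selected from the bsos
# table. The actual sync.avsc isn't shown on this page; the following is
# only a guess at its shape, with names and field types assumed rather than
# taken from the real file:
#
#   {
#     "namespace": "sync",
#     "type": "record",
#     "name": "bso",
#     "fields": [
#       {"name": "collection_id", "type": "int"},
#       {"name": "fxa_kid", "type": "string"},
#       {"name": "fxa_uid", "type": "string"},
#       {"name": "bso_id", "type": "string"},
#       {"name": "expiry", "type": "long"},
#       {"name": "modified", "type": "long"},
#       {"name": "payload", "type": "string"},
#       {"name": "sortindex", "type": ["null", "int"]}
#     ]
#   }
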
import avro.schema
import argparse
import time

from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from google.cloud import spanner


def get_args():
    parser = argparse.ArgumentParser(description="dump spanner to avro files")
    parser.add_argument(
        '--instance_id', default="spanner-test",
        help="Spanner instance name")
    parser.add_argument(
        '--database_id', default="sync_schema3",
        help="Spanner database name")
    parser.add_argument(
        '--schema', default="sync.avsc",
        help="Avro schema describing the dumped rows")
    parser.add_argument(
        '--output', default="output.avso",
        help="Output file")
    parser.add_argument(
        '--limit', type=int, default=1500000,
        help="Maximum rows to fetch per query")
    return parser.parse_args()


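# Example invocation using the defaults above (the script's real filename
# isn't shown on this page, so "dump_to_avro.py" is only a placeholder):
#
#   venv/bin/python dump_to_avro.py --instance_id spanner-test \
#       --database_id sync_schema3 --schema sync.avsc --output output.avso

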
def conf_spanner(args):
    spanner_client = spanner.Client()
    instance = spanner_client.instance(args.instance_id)
    database = instance.database(args.database_id)
    return database


def dump_rows(offset, db, writer, args):
    print("Querying.... @{}".format(offset))
    # Note: there is no ORDER BY here, so LIMIT/OFFSET paging is not
    # guaranteed to return a stable ordering across separate snapshots.
    sql = """
    SELECT collection_id, fxa_kid, fxa_uid, bso_id,
        UNIX_MICROS(expiry), UNIX_MICROS(modified), payload,
        sortindex from bsos LIMIT {} OFFSET {}""".format(args.limit, offset)
    try:
        with db.snapshot() as snapshot:
            result = snapshot.execute_sql(sql)
            print("Dumping...")
            for row in result:
                writer.append({
                    "collection_id": row[0],
                    "fxa_kid": row[1],
                    "fxa_uid": row[2],
                    "bso_id": row[3],
                    "expiry": row[4],
                    "modified": row[5],
                    "payload": row[6],
                    "sortindex": row[7]})
                offset += 1
                if offset % 1000 == 0:
                    print("Row: {}".format(offset))
            return offset
    except Exception as ex:
        print("Deadline hit at: {} ({})".format(offset, ex))
        return offset


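# The header note about Spanner deadlines suggests splitting the dump across
# threads. The sketch below is a rough, untested illustration of that idea
# (it is not part of the original script): each worker gets its own database
# handle and its own Avro part file, since neither Spanner sessions nor
# DataFileWriter instances should be shared across threads. The part files
# would need to be merged afterwards, and without an ORDER BY the chunks may
# still overlap or miss rows.
import threading  # only needed by the sketch below


def dump_chunk(args, schema, part, chunk_start, chunk_size):
    # Dump rows [chunk_start, chunk_start + chunk_size) into a part file.
    db = conf_spanner(args)
    writer = DataFileWriter(
        open("{}.part{}".format(args.output, part), "wb"),
        DatumWriter(), schema)
    chunk_args = argparse.Namespace(**vars(args))
    chunk_args.limit = chunk_size
    dump_rows(offset=chunk_start, db=db, writer=writer, args=chunk_args)
    writer.close()


def dump_data_threaded(args, schema, workers=4):
    # Split the table into `workers` roughly equal chunks and dump them
    # concurrently.
    row_count = count_rows(conf_spanner(args))
    chunk_size = (row_count // workers) + 1
    threads = []
    for part in range(workers):
        thread = threading.Thread(
            target=dump_chunk,
            args=(args, schema, part, part * chunk_size, chunk_size))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    return row_count

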
def count_rows(db):
    with db.snapshot() as snapshot:
        result = snapshot.execute_sql("SELECT Count(*) from bsos")
        return result.one()[0]


def dump_data(args, schema):
    offset = 0
    # things time out around 1_500_000 rows.
    db = conf_spanner(args)
    writer = DataFileWriter(
        open(args.output, "wb"), DatumWriter(), schema)
    row_count = count_rows(db)
    print("Dumping {} rows".format(row_count))
    # Keep re-querying from the last offset reached; bail out if a pass
    # makes no progress (e.g. an immediate deadline failure).
    while offset < row_count:
        old_offset = offset
        offset = dump_rows(offset=offset, db=db, writer=writer, args=args)
        if offset == old_offset:
            break
    writer.close()
    return row_count


def main():
    start = time.time()
    args = get_args()
    schema = avro.schema.parse(open(args.schema, "rb").read())
    rows = dump_data(args, schema)
    print("Dumped: {} rows in {} seconds".format(rows, time.time() - start))


if __name__ == "__main__":
    main()
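
# To spot-check the dump afterwards, the file can be read back with avro's
# DataFileReader (a hedged example, not part of this script; adjust the
# filename to whatever --output was set to):
#
#   from avro.datafile import DataFileReader
#   from avro.io import DatumReader
#
#   reader = DataFileReader(open("output.avso", "rb"), DatumReader())
#   for record in reader:
#       print(record["fxa_uid"], record["bso_id"])
#   reader.close()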