From 7b487ddacaf38a96acac29c29e815f0d608b59b5 Mon Sep 17 00:00:00 2001
From: Dimitri Fontaine
Date: Thu, 18 Oct 2018 15:42:17 +0200
Subject: [PATCH] Add a Citus distribution test case, from the Citus tutorial.

---
 test/citus/.gitignore   |  1 +
 test/citus/Makefile     | 20 ++++++++++++
 test/citus/README.md    | 42 +++++++++++++++++++++++++
 test/citus/company.load | 12 ++++++++
 test/citus/company.sql  | 51 +++++++++++++++++++++++++++++++
 test/citus/copy.sql     |  5 +++
 test/citus/data.load    | 68 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 199 insertions(+)
 create mode 100644 test/citus/.gitignore
 create mode 100644 test/citus/Makefile
 create mode 100644 test/citus/README.md
 create mode 100644 test/citus/company.load
 create mode 100644 test/citus/company.sql
 create mode 100644 test/citus/copy.sql
 create mode 100644 test/citus/data.load

diff --git a/test/citus/.gitignore b/test/citus/.gitignore
new file mode 100644
index 0000000..16f2dc5
--- /dev/null
+++ b/test/citus/.gitignore
@@ -0,0 +1 @@
+*.csv
\ No newline at end of file
diff --git a/test/citus/Makefile b/test/citus/Makefile
new file mode 100644
index 0000000..0c4c5b9
--- /dev/null
+++ b/test/citus/Makefile
@@ -0,0 +1,20 @@
+DATASET = companies campaigns ads clicks impressions geo_ips
+CSV = $(addsuffix .csv,$(DATASET))
+DROP = DROP TABLE IF EXISTS companies, campaigns, ads, clicks, impressions, geo_ips
+
+all: schema data ;
+
+schema:
+	psql --single-transaction -c "$(DROP)" -d hackathon
+	psql --single-transaction -f company.sql -d hackathon
+
+data: fetch
+	psql -f copy.sql -d hackathon
+	../../build/bin/pgloader ./data.load
+
+fetch: $(CSV) ;
+
+%.csv:
+	curl -O https://examples.citusdata.com/mt_ref_arch/$@
+
+.PHONY: schema data fetch
diff --git a/test/citus/README.md b/test/citus/README.md
new file mode 100644
index 0000000..499ecd6
--- /dev/null
+++ b/test/citus/README.md
@@ -0,0 +1,42 @@
+# Citus Multi-Tenant Automatic Distribution
+
+In this test case we follow this documentation:
+
+    https://docs.citusdata.com/en/v7.5/use_cases/multi_tenant.html
+
+We install the schema as it was before the Citus migration, and we load
+the data without the backfilling that has already been done. For that we
+use pgloader to ignore the company_id column in the tables that didn't
+have this column prior to the Citus migration effort.
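+
+As an illustration of what ignoring that column means (the snippet below
+is a sketch only, not part of the test case): plain COPY cannot skip a
+column that is present in its input file, so loading e.g. ads.csv into
+the pre-migration ads table by hand would need a staging step along
+these lines, with a made-up staging table name:
+
+```
+-- hand-written equivalent of the ads rule in data.load, for illustration
+-- only; the staging table mirrors the CSV layout, extra company_id included
+BEGIN;
+CREATE TEMP TABLE ads_staging (
+    id bigint,
+    company_id bigint,   -- present in ads.csv, absent from the ads table here
+    campaign_id bigint,
+    name text,
+    image_url text,
+    target_url text,
+    impressions_count bigint,
+    clicks_count bigint,
+    created_at timestamp without time zone,
+    updated_at timestamp without time zone
+);
+\copy ads_staging from 'ads.csv' with csv
+INSERT INTO ads (id, campaign_id, name, image_url, target_url,
+                 impressions_count, clicks_count, created_at, updated_at)
+     SELECT id, campaign_id, name, image_url, target_url,
+            impressions_count, clicks_count, created_at, updated_at
+       FROM ads_staging;
+COMMIT;
+```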
+
+The following `company.load` file contains the pgloader command that
+runs a full migration from PostgreSQL to Citus:
+
+```
+load database
+     from pgsql:///hackathon
+     into pgsql://localhost:9700/dim
+
+     with include drop, reset no sequences
+
+     distribute companies using id;
+```
+
+Tables are marked as distributed, the company_id column is added where
+it is needed, primary key and foreign key definitions are altered to the
+new model, and finally the data is backfilled automatically in the target
+tables by generating queries like the following:
+
+~~~
+SELECT "campaigns".company_id::text,
+       "impressions".id::text,
+       "impressions".ad_id::text,
+       "impressions".seen_at::text,
+       "impressions".site_url::text,
+       "impressions".cost_per_impression_usd::text,
+       "impressions".user_ip::text,
+       "impressions".user_data::text
+  FROM "public"."impressions"
+       JOIN "public"."ads" ON impressions.ad_id = ads.id
+       JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id
+~~~
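+
+For reference, marking a table as distributed corresponds, on the Citus
+side, to the create_distributed_table calls used in the tutorial linked
+above. The sketch below only spells out the resulting model; pgloader
+takes care of the equivalent when given the distribute rule:
+
+```
+-- distribution as declared in the Citus multi-tenant tutorial: companies
+-- is sharded by id, the other tables are co-located on company_id
+SELECT create_distributed_table('companies', 'id');
+SELECT create_distributed_table('campaigns', 'company_id');
+SELECT create_distributed_table('ads', 'company_id');
+SELECT create_distributed_table('clicks', 'company_id');
+SELECT create_distributed_table('impressions', 'company_id');
+```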
diff --git a/test/citus/company.load b/test/citus/company.load
new file mode 100644
index 0000000..ef4af21
--- /dev/null
+++ b/test/citus/company.load
@@ -0,0 +1,12 @@
+load database
+     from pgsql:///hackathon
+     into pgsql://localhost:9700/dim
+
+     with include drop, reset no sequences
+
+     distribute companies using id
+     -- distribute campaigns using company_id
+     -- distribute ads using company_id from campaigns
+     -- distribute clicks using company_id from ads, campaigns
+     -- distribute impressions using company_id from ads, campaigns
+     ;
diff --git a/test/citus/company.sql b/test/citus/company.sql
new file mode 100644
index 0000000..dad23dc
--- /dev/null
+++ b/test/citus/company.sql
@@ -0,0 +1,51 @@
+CREATE TABLE companies (
+    id bigserial PRIMARY KEY,
+    name text NOT NULL,
+    image_url text,
+    created_at timestamp without time zone NOT NULL,
+    updated_at timestamp without time zone NOT NULL
+);
+
+CREATE TABLE campaigns (
+    id bigserial PRIMARY KEY,
+    company_id bigint REFERENCES companies (id),
+    name text NOT NULL,
+    cost_model text NOT NULL,
+    state text NOT NULL,
+    monthly_budget bigint,
+    blacklisted_site_urls text[],
+    created_at timestamp without time zone NOT NULL,
+    updated_at timestamp without time zone NOT NULL
+);
+
+CREATE TABLE ads (
+    id bigserial PRIMARY KEY,
+    campaign_id bigint REFERENCES campaigns (id),
+    name text NOT NULL,
+    image_url text,
+    target_url text,
+    impressions_count bigint DEFAULT 0,
+    clicks_count bigint DEFAULT 0,
+    created_at timestamp without time zone NOT NULL,
+    updated_at timestamp without time zone NOT NULL
+);
+
+CREATE TABLE clicks (
+    id bigserial PRIMARY KEY,
+    ad_id bigint REFERENCES ads (id),
+    clicked_at timestamp without time zone NOT NULL,
+    site_url text NOT NULL,
+    cost_per_click_usd numeric(20,10),
+    user_ip inet NOT NULL,
+    user_data jsonb NOT NULL
+);
+
+CREATE TABLE impressions (
+    id bigserial PRIMARY KEY,
+    ad_id bigint REFERENCES ads (id),
+    seen_at timestamp without time zone NOT NULL,
+    site_url text NOT NULL,
+    cost_per_impression_usd numeric(20,10),
+    user_ip inet NOT NULL,
+    user_data jsonb NOT NULL
+);
diff --git a/test/citus/copy.sql b/test/citus/copy.sql
new file mode 100644
index 0000000..684f891
--- /dev/null
+++ b/test/citus/copy.sql
@@ -0,0 +1,5 @@
+\copy companies from 'companies.csv' with csv
+\copy campaigns from 'campaigns.csv' with csv
+-- \copy ads from 'ads.csv' with csv
+-- \copy clicks from 'clicks.csv' with csv
+-- \copy impressions from 'impressions.csv' with csv
diff --git a/test/citus/data.load b/test/citus/data.load
new file mode 100644
index 0000000..cbb29b0
--- /dev/null
+++ b/test/citus/data.load
@@ -0,0 +1,68 @@
+--
+-- Ads
+--
+load csv
+     from ads.csv
+          (
+            id, company_id, campaign_id, name, image_url, target_url,
+            impressions_count, clicks_count, created_at, updated_at
+          )
+
+     into postgresql:///hackathon
+
+     target table ads
+     target columns
+          (
+            id, campaign_id, name, image_url, target_url,
+            impressions_count, clicks_count, created_at, updated_at
+          )
+
+     with fields optionally enclosed by '"',
+          fields escaped by double-quote,
+          fields terminated by ',';
+
+--
+-- Clicks
+--
+load csv
+     from clicks.csv
+          (
+            id, company_id, ad_id, clicked_at, site_url, cost_per_click_usd,
+            user_ip, user_data
+          )
+
+     into postgresql:///hackathon
+
+     target table clicks
+     target columns
+          (
+            id, ad_id, clicked_at, site_url, cost_per_click_usd, user_ip, user_data
+          )
+
+     with fields optionally enclosed by '"',
+          fields escaped by double-quote,
+          fields terminated by ',';
+
+
+--
+-- Impressions
+--
+load csv
+     from impressions.csv
+          (
+            id, company_id, ad_id, seen_at, site_url,
+            cost_per_impression_usd, user_ip, user_data
+          )
+
+     into postgresql:///hackathon
+
+     target table impressions
+     target columns
+          (
+            id, ad_id, seen_at, site_url, cost_per_impression_usd, user_ip, user_data
+          )
+
+     with drop indexes,
+          fields optionally enclosed by '"',
+          fields escaped by double-quote,
+          fields terminated by ',';
\ No newline at end of file
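
Side note, not part of the test files: once `make` has run and `company.load`
has been applied, a tenant-scoped query against the Citus target
(`pgsql://localhost:9700/dim` above) gives a quick sanity check of the
backfilled model; the company id used below is an arbitrary example value.

```
-- every table is filtered on the same tenant (company_id), which is the
-- whole point of the multi-tenant distribution model
SELECT c.name, count(*) AS impressions
  FROM companies c
       JOIN campaigns ca ON ca.company_id = c.id
       JOIN ads a         ON a.company_id = ca.company_id AND a.campaign_id = ca.id
       JOIN impressions i ON i.company_id = a.company_id  AND i.ad_id = a.id
 WHERE c.id = 1
 GROUP BY c.name;
```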