Add a Citus distribution test case, from the citus tutorial.

This commit is contained in:
Dimitri Fontaine 2018-10-18 15:42:17 +02:00
parent d3b21ac54d
commit 7b487ddaca
7 changed files with 199 additions and 0 deletions

1
test/citus/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.csv

20
test/citus/Makefile Normal file
View File

@ -0,0 +1,20 @@
# Fetch the Citus multi-tenant tutorial dataset and load it into the
# local "hackathon" database, using pgloader for the tables whose CSV
# files still carry the backfilled company_id column.
DATASET := companies campaigns ads clicks impressions geo_ips
CSV     := $(addsuffix .csv,$(DATASET))
DROP    := DROP TABLE IF EXISTS companies, campaigns, ads, clicks, impressions, geo_ips

# Remove half-written targets (e.g. a partially downloaded CSV) when a
# recipe fails, so they don't look "up to date" on the next run.
.DELETE_ON_ERROR:

all: schema data ;

schema:
	psql --single-transaction -c "$(DROP)" -d hackathon
	psql --single-transaction -f company.sql -d hackathon

# copy.sql loads the tables that match the CSV layout directly; the
# remaining CSV files are loaded through pgloader (see data.load).
data: fetch
	psql -f copy.sql -d hackathon
	../../build/bin/pgloader ./data.load

fetch: $(CSV) ;

# -f makes curl fail on HTTP errors instead of saving the error page as
# a .csv; -sSL keeps it quiet but still reports failures and follows
# redirects.
%.csv:
	curl -fsSL -O https://examples.citusdata.com/mt_ref_arch/$@

.PHONY: all schema data fetch

42
test/citus/README.md Normal file
View File

@ -0,0 +1,42 @@
# Citus Multi-Tenant Automatic Distribution
This test case follows the steps documented at:
https://docs.citusdata.com/en/v7.5/use_cases/multi_tenant.html
We install the schema before Citus migration, and load the data without the
backfilling that is already done. For that we use pgloader to ignore the
company_id column in the tables that didn't have this column prior to the
Citus migration effort.
Then the following `company.load` file contains the pgloader command that
runs a full migration from PostgreSQL to Citus:
```
load database
from pgsql:///hackathon
into pgsql://localhost:9700/dim
with include drop, reset no sequences
distribute companies using id;
```
Tables are marked distributed, the company_id column is added where it's
needed, primary keys and foreign keys definitions are altered to the new
model, and finally the data is backfilled automatically in the target table
thanks to generating queries like the following:
~~~
SELECT "campaigns".company_id::text,
"impressions".id::text,
"impressions".ad_id::text,
"impressions".seen_at::text,
"impressions".site_url::text,
"impressions".cost_per_impression_usd::text,
"impressions".user_ip::text,
"impressions".user_data::text
FROM "public"."impressions"
JOIN "public"."ads" ON impressions.ad_id = ads.id
JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id
~~~

12
test/citus/company.load Normal file
View File

@ -0,0 +1,12 @@
-- pgloader command file: migrate the local "hackathon" database to a
-- Citus cluster (coordinator on localhost:9700), dropping any existing
-- target tables first and keeping sequence values as-is.
load database
from pgsql:///hackathon
into pgsql://localhost:9700/dim
with include drop, reset no sequences
-- Only the root of the distribution tree needs an explicit rule; the
-- commented-out clauses below spell out what pgloader derives for the
-- dependent tables from the foreign keys (see README.md — the
-- company_id column is added and backfilled automatically where
-- needed).
distribute companies using id
-- distribute campaigns using company_id
-- distribute ads using company_id from campaigns
-- distribute clicks using company_id from ads, campaigns
-- distribute impressions using company_id from ads, campaigns
;

51
test/citus/company.sql vendored Normal file
View File

@ -0,0 +1,51 @@
-- Pre-Citus-migration schema from the Citus multi-tenant tutorial:
-- clicks and impressions have no company_id column yet — pgloader adds
-- and backfills it during distribution (see company.load / README.md).
-- NOTE: column order here must match the CSV column order relied on by
-- copy.sql and data.load.

-- Tenant root table; every other table hangs off it via foreign keys.
CREATE TABLE companies (
id bigserial PRIMARY KEY,
name text NOT NULL,
image_url text,
created_at timestamp without time zone NOT NULL,
updated_at timestamp without time zone NOT NULL
);

-- Campaigns belong directly to a company.
CREATE TABLE campaigns (
id bigserial PRIMARY KEY,
company_id bigint REFERENCES companies (id),
name text NOT NULL,
cost_model text NOT NULL,
state text NOT NULL,
monthly_budget bigint,
blacklisted_site_urls text[],
created_at timestamp without time zone NOT NULL,
updated_at timestamp without time zone NOT NULL
);

-- Ads reach the company only through campaigns (no company_id here).
CREATE TABLE ads (
id bigserial PRIMARY KEY,
campaign_id bigint REFERENCES campaigns (id),
name text NOT NULL,
image_url text,
target_url text,
impressions_count bigint DEFAULT 0,
clicks_count bigint DEFAULT 0,
created_at timestamp without time zone NOT NULL,
updated_at timestamp without time zone NOT NULL
);

-- Event table keyed to ads; company_id is derived via ads → campaigns.
CREATE TABLE clicks (
id bigserial PRIMARY KEY,
ad_id bigint REFERENCES ads (id),
clicked_at timestamp without time zone NOT NULL,
site_url text NOT NULL,
cost_per_click_usd numeric(20,10),
user_ip inet NOT NULL,
user_data jsonb NOT NULL
);

-- Same shape as clicks; also lacks company_id before the migration.
CREATE TABLE impressions (
id bigserial PRIMARY KEY,
ad_id bigint REFERENCES ads (id),
seen_at timestamp without time zone NOT NULL,
site_url text NOT NULL,
cost_per_impression_usd numeric(20,10),
user_ip inet NOT NULL,
user_data jsonb NOT NULL
);

5
test/citus/copy.sql vendored Normal file
View File

@ -0,0 +1,5 @@
-- psql bulk load for the tables whose CSV columns match the
-- pre-migration schema exactly. The other three CSV files carry a
-- backfilled company_id column that the schema does not have yet, so
-- they are loaded through pgloader instead (see data.load and the
-- Makefile's `data` target) — hence the commented-out lines below.
\copy companies from 'companies.csv' with csv
\copy campaigns from 'campaigns.csv' with csv
-- \copy ads from 'ads.csv' with csv
-- \copy clicks from 'clicks.csv' with csv
-- \copy impressions from 'impressions.csv' with csv

68
test/citus/data.load Normal file
View File

@ -0,0 +1,68 @@
--
-- Ads
--
-- The tutorial CSV files already contain the backfilled company_id
-- column; the pre-migration schema does not. Each load below therefore
-- names company_id in the input field list but omits it from the
-- target columns, so pgloader reads and discards it.
load csv
from ads.csv
(
id, company_id, campaign_id, name, image_url, target_url,
impressions_count, clicks_count, created_at, updated_at
)
into postgresql:///hackathon
target table ads
target columns
(
id, campaign_id, name, image_url, target_url,
impressions_count, clicks_count, created_at, updated_at
)
with fields optionally enclosed by '"',
fields escaped by double-quote,
fields terminated by ',';
--
-- Clicks
--
-- Same pattern: drop company_id on the way in.
load csv
from clicks.csv
(
id, company_id, ad_id, clicked_at, site_url, cost_per_click_usd,
user_ip, user_data
)
into postgresql:///hackathon
target table clicks
target columns
(
id, ad_id, clicked_at, site_url, cost_per_click_usd, user_ip, user_data
)
with fields optionally enclosed by '"',
fields escaped by double-quote,
fields terminated by ',';
--
-- Impressions
--
-- Same pattern again; `drop indexes` is presumably here to speed up
-- the bulk load of this (larger) table — confirm against the pgloader
-- reference for index-rebuild behavior after the load.
load csv
from impressions.csv
(
id, company_id, ad_id, seen_at, site_url,
cost_per_impression_usd, user_ip, user_data
)
into postgresql:///hackathon
target table impressions
target columns
(
id, ad_id, seen_at, site_url, cost_per_impression_usd, user_ip, user_data
)
with drop indexes,
fields optionally enclosed by '"',
fields escaped by double-quote,
fields terminated by ',';