From 7b487ddacaf38a96acac29c29e815f0d608b59b5 Mon Sep 17 00:00:00 2001
From: Dimitri Fontaine
Date: Thu, 18 Oct 2018 15:42:17 +0200
Subject: [PATCH] Add a Citus distribution test case, from the Citus tutorial.

---
 test/citus/.gitignore   |  1 +
 test/citus/Makefile     | 20 ++++++++++++
 test/citus/README.md    | 42 +++++++++++++++++++++++++
 test/citus/company.load | 12 ++++++++
 test/citus/company.sql  | 51 +++++++++++++++++++++++++++++++
 test/citus/copy.sql     |  5 +++
 test/citus/data.load    | 68 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 199 insertions(+)
 create mode 100644 test/citus/.gitignore
 create mode 100644 test/citus/Makefile
 create mode 100644 test/citus/README.md
 create mode 100644 test/citus/company.load
 create mode 100644 test/citus/company.sql
 create mode 100644 test/citus/copy.sql
 create mode 100644 test/citus/data.load

diff --git a/test/citus/.gitignore b/test/citus/.gitignore
new file mode 100644
index 0000000..16f2dc5
--- /dev/null
+++ b/test/citus/.gitignore
@@ -0,0 +1 @@
+*.csv
\ No newline at end of file
diff --git a/test/citus/Makefile b/test/citus/Makefile
new file mode 100644
index 0000000..0c4c5b9
--- /dev/null
+++ b/test/citus/Makefile
@@ -0,0 +1,20 @@
+DATASET = companies campaigns ads clicks impressions geo_ips
+CSV = $(addsuffix .csv,$(DATASET))
+DROP = DROP TABLE IF EXISTS companies, campaigns, ads, clicks, impressions, geo_ips
+
+all: schema data ;
+
+schema:
+	psql --single-transaction -c "$(DROP)" -d hackathon
+	psql --single-transaction -f company.sql -d hackathon
+
+data: fetch
+	psql -f copy.sql -d hackathon
+	../../build/bin/pgloader ./data.load
+
+fetch: $(CSV) ;
+
+%.csv:
+	curl -O https://examples.citusdata.com/mt_ref_arch/$@
+
+.PHONY: schema data fetch
diff --git a/test/citus/README.md b/test/citus/README.md
new file mode 100644
index 0000000..499ecd6
--- /dev/null
+++ b/test/citus/README.md
@@ -0,0 +1,42 @@
+# Citus Multi-Tenant Automatic Distribution
+
+In this test case we follow this documentation:
+
+    https://docs.citusdata.com/en/v7.5/use_cases/multi_tenant.html
+
+We install the schema as it was before the Citus migration, and we load
+the data without the backfilling that has already been done. For that we
+use pgloader to ignore the company_id column in the tables that didn't
+have this column prior to the Citus migration effort.
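+
+As an illustration of what ignoring that column means (the snippet below
+is a sketch only, not part of the test case): plain COPY cannot skip a
+column that is present in its input file, so loading e.g. ads.csv into
+the pre-migration ads table by hand would need a staging step along
+these lines, with a made-up staging table name:
+
+```
+-- hand-written equivalent of the ads rule in data.load, for illustration
+-- only; the staging table mirrors the CSV layout, extra company_id included
+BEGIN;
+CREATE TEMP TABLE ads_staging (
+    id bigint,
+    company_id bigint,   -- present in ads.csv, absent from the ads table here
+    campaign_id bigint,
+    name text,
+    image_url text,
+    target_url text,
+    impressions_count bigint,
+    clicks_count bigint,
+    created_at timestamp without time zone,
+    updated_at timestamp without time zone
+);
+\copy ads_staging from 'ads.csv' with csv
+INSERT INTO ads (id, campaign_id, name, image_url, target_url,
+                 impressions_count, clicks_count, created_at, updated_at)
+     SELECT id, campaign_id, name, image_url, target_url,
+            impressions_count, clicks_count, created_at, updated_at
+       FROM ads_staging;
+COMMIT;
+```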
+
+The following `company.load` file contains the pgloader command that
+runs a full migration from PostgreSQL to Citus:
+
+```
+load database
+     from pgsql:///hackathon
+     into pgsql://localhost:9700/dim
+
+     with include drop, reset no sequences
+
+     distribute companies using id;
+```
+
+Tables are marked as distributed, the company_id column is added where
+it is needed, primary key and foreign key definitions are altered to the
+new model, and finally the data is backfilled automatically in the target
+tables by generating queries like the following:
+
+~~~
+SELECT "campaigns".company_id::text,
+       "impressions".id::text,
+       "impressions".ad_id::text,
+       "impressions".seen_at::text,
+       "impressions".site_url::text,
+       "impressions".cost_per_impression_usd::text,
+       "impressions".user_ip::text,
+       "impressions".user_data::text
+  FROM "public"."impressions"
+       JOIN "public"."ads" ON impressions.ad_id = ads.id
+       JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id
+~~~
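+
+For reference, marking a table as distributed corresponds, on the Citus
+side, to the create_distributed_table calls used in the tutorial linked
+above. The sketch below only spells out the resulting model; pgloader
+takes care of the equivalent when given the distribute rule:
+
+```
+-- distribution as declared in the Citus multi-tenant tutorial: companies
+-- is sharded by id, the other tables are co-located on company_id
+SELECT create_distributed_table('companies', 'id');
+SELECT create_distributed_table('campaigns', 'company_id');
+SELECT create_distributed_table('ads', 'company_id');
+SELECT create_distributed_table('clicks', 'company_id');
+SELECT create_distributed_table('impressions', 'company_id');
+```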
diff --git a/test/citus/company.load b/test/citus/company.load
new file mode 100644
index 0000000..ef4af21
--- /dev/null
+++ b/test/citus/company.load
@@ -0,0 +1,12 @@
+load database
+     from pgsql:///hackathon
+     into pgsql://localhost:9700/dim
+
+     with include drop, reset no sequences
+
+     distribute companies using id
+     -- distribute campaigns using company_id
+     -- distribute ads using company_id from campaigns
+     -- distribute clicks using company_id from ads, campaigns
+     -- distribute impressions using company_id from ads, campaigns
+     ;
diff --git a/test/citus/company.sql b/test/citus/company.sql
new file mode 100644
index 0000000..dad23dc
--- /dev/null
+++ b/test/citus/company.sql
@@ -0,0 +1,51 @@
+CREATE TABLE companies (
+    id bigserial PRIMARY KEY,
+    name text NOT NULL,
+    image_url text,
+    created_at timestamp without time zone NOT NULL,
+    updated_at timestamp without time zone NOT NULL
+);
+
+CREATE TABLE campaigns (
+    id bigserial PRIMARY KEY,
+    company_id bigint REFERENCES companies (id),
+    name text NOT NULL,
+    cost_model text NOT NULL,
+    state text NOT NULL,
+    monthly_budget bigint,
+    blacklisted_site_urls text[],
+    created_at timestamp without time zone NOT NULL,
+    updated_at timestamp without time zone NOT NULL
+);
+
+CREATE TABLE ads (
+    id bigserial PRIMARY KEY,
+    campaign_id bigint REFERENCES campaigns (id),
+    name text NOT NULL,
+    image_url text,
+    target_url text,
+    impressions_count bigint DEFAULT 0,
+    clicks_count bigint DEFAULT 0,
+    created_at timestamp without time zone NOT NULL,
+    updated_at timestamp without time zone NOT NULL
+);
+
+CREATE TABLE clicks (
+    id bigserial PRIMARY KEY,
+    ad_id bigint REFERENCES ads (id),
+    clicked_at timestamp without time zone NOT NULL,
+    site_url text NOT NULL,
+    cost_per_click_usd numeric(20,10),
+    user_ip inet NOT NULL,
+    user_data jsonb NOT NULL
+);
+
+CREATE TABLE impressions (
+    id bigserial PRIMARY KEY,
+    ad_id bigint REFERENCES ads (id),
+    seen_at timestamp without time zone NOT NULL,
+    site_url text NOT NULL,
+    cost_per_impression_usd numeric(20,10),
+    user_ip inet NOT NULL,
+    user_data jsonb NOT NULL
+);
diff --git a/test/citus/copy.sql b/test/citus/copy.sql
new file mode 100644
index 0000000..684f891
--- /dev/null
+++ b/test/citus/copy.sql
@@ -0,0 +1,5 @@
+\copy companies from 'companies.csv' with csv
+\copy campaigns from 'campaigns.csv' with csv
+-- \copy ads from 'ads.csv' with csv
+-- \copy clicks from 'clicks.csv' with csv
+-- \copy impressions from 'impressions.csv' with csv
diff --git a/test/citus/data.load b/test/citus/data.load
new file mode 100644
index 0000000..cbb29b0
--- /dev/null
+++ b/test/citus/data.load
@@ -0,0 +1,68 @@
+--
+-- Ads
+--
+load csv
+     from ads.csv
+          (
+            id, company_id, campaign_id, name, image_url, target_url,
+            impressions_count, clicks_count, created_at, updated_at
+          )
+
+     into postgresql:///hackathon
+
+     target table ads
+     target columns
+          (
+            id, campaign_id, name, image_url, target_url,
+            impressions_count, clicks_count, created_at, updated_at
+          )
+
+     with fields optionally enclosed by '"',
+          fields escaped by double-quote,
+          fields terminated by ',';
+
+--
+-- Clicks
+--
+load csv
+     from clicks.csv
+          (
+            id, company_id, ad_id, clicked_at, site_url, cost_per_click_usd,
+            user_ip, user_data
+          )
+
+     into postgresql:///hackathon
+
+     target table clicks
+     target columns
+          (
+            id, ad_id, clicked_at, site_url, cost_per_click_usd, user_ip, user_data
+          )
+
+     with fields optionally enclosed by '"',
+          fields escaped by double-quote,
+          fields terminated by ',';
+
+
+--
+-- Impressions
+--
+load csv
+     from impressions.csv
+          (
+            id, company_id, ad_id, seen_at, site_url,
+            cost_per_impression_usd, user_ip, user_data
+          )
+
+     into postgresql:///hackathon
+
+     target table impressions
+     target columns
+          (
+            id, ad_id, seen_at, site_url, cost_per_impression_usd, user_ip, user_data
+          )
+
+     with drop indexes,
+          fields optionally enclosed by '"',
+          fields escaped by double-quote,
+          fields terminated by ',';
\ No newline at end of file
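
Side note, not part of the test files: once `make` has run and `company.load`
has been applied, a tenant-scoped query against the Citus target
(`pgsql://localhost:9700/dim` above) gives a quick sanity check of the
backfilled model; the company id used below is an arbitrary example value.

```
-- every table is filtered on the same tenant (company_id), which is the
-- whole point of the multi-tenant distribution model
SELECT c.name, count(*) AS impressions
  FROM companies c
       JOIN campaigns ca ON ca.company_id = c.id
       JOIN ads a         ON a.company_id = ca.company_id AND a.campaign_id = ca.id
       JOIN impressions i ON i.company_id = a.company_id  AND i.ad_id = a.id
 WHERE c.id = 1
 GROUP BY c.name;
```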