mirror of
https://github.com/dimitri/pgloader.git
synced 2025-08-07 23:07:00 +02:00
Add a Citus distribution test case, from the citus tutorial.
This commit is contained in:
parent
d3b21ac54d
commit
7b487ddaca
1
test/citus/.gitignore
vendored
Normal file
1
test/citus/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.csv
|
20
test/citus/Makefile
Normal file
20
test/citus/Makefile
Normal file
@ -0,0 +1,20 @@
|
||||
# Citus multi-tenant test case: fetch the tutorial CSV data set and load
# it into PostgreSQL (schema first, then data via psql and pgloader).

DATASET := companies campaigns ads clicks impressions geo_ips

CSV := $(addsuffix .csv,$(DATASET))

# Target database name; override with `make DB=otherdb`.
DB ?= hackathon

DROP := DROP TABLE IF EXISTS companies, campaigns, ads, clicks, impressions, geo_ips

# NOTE: `all' relies on left-to-right ordering (schema before data) and is
# therefore not `-j' safe; `data' deliberately does not depend on `schema'
# so that `make data' alone never drops existing tables.
all: schema data ;

schema:
	psql --single-transaction -c "$(DROP)" -d $(DB)
	psql --single-transaction -f company.sql -d $(DB)

data: fetch
	psql -f copy.sql -d $(DB)
	../../build/bin/pgloader ./data.load

fetch: $(CSV) ;

# Download each CSV file: -f makes curl fail on HTTP errors instead of
# saving the error page as data, and the tmp-then-rename keeps a partial
# download from masquerading as an up-to-date target.
%.csv:
	curl -fsSL -o $@.tmp https://examples.citusdata.com/mt_ref_arch/$@
	mv $@.tmp $@

.PHONY: all schema data fetch
|
42
test/citus/README.md
Normal file
42
test/citus/README.md
Normal file
@ -0,0 +1,42 @@
|
||||
# Citus Multi-Tenant Automatic Distribution
|
||||
|
||||
In this test case we follow the Citus multi-tenant use-case tutorial:
|
||||
|
||||
https://docs.citusdata.com/en/v7.5/use_cases/multi_tenant.html
|
||||
|
||||
We install the schema as it stood before the Citus migration, and load the
data while skipping the company_id backfilling that has already been done
in the exported CSV files. For that we use pgloader to ignore the
company_id column in the tables that didn't have this column prior to the
Citus migration effort.
|
||||
|
||||
Then the following `company.load` file contains the pgloader command that
|
||||
runs a full migration from PostgreSQL to Citus:
|
||||
|
||||
```
|
||||
load database
|
||||
from pgsql:///hackathon
|
||||
into pgsql://localhost:9700/dim
|
||||
|
||||
with include drop, reset no sequences
|
||||
|
||||
distribute companies using id;
|
||||
```
|
||||
|
||||
Tables are marked distributed, the company_id column is added where it's
|
||||
needed, primary keys and foreign keys definitions are altered to the new
|
||||
model, and finally the data is backfilled automatically in the target table
|
||||
thanks to generating queries like the following:
|
||||
|
||||
~~~
|
||||
SELECT "campaigns".company_id::text,
|
||||
"impressions".id::text,
|
||||
"impressions".ad_id::text,
|
||||
"impressions".seen_at::text,
|
||||
"impressions".site_url::text,
|
||||
"impressions".cost_per_impression_usd::text,
|
||||
"impressions".user_ip::text,
|
||||
"impressions".user_data::text
|
||||
FROM "public"."impressions"
|
||||
JOIN "public"."ads" ON impressions.ad_id = ads.id
|
||||
JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id
|
||||
~~~
|
12
test/citus/company.load
Normal file
12
test/citus/company.load
Normal file
@ -0,0 +1,12 @@
|
||||
-- pgloader command: migrate the hackathon database to a Citus cluster,
-- distributing the multi-tenant schema by company.
load database
     from pgsql:///hackathon
     into pgsql://localhost:9700/dim

 with include drop, reset no sequences

 -- Per README.md, distributing the root table is enough: pgloader marks
 -- the dependent tables distributed and backfills company_id where it is
 -- needed. The commented-out clauses below spell out the explicit
 -- per-table equivalent — presumably derived automatically; confirm
 -- against the pgloader distribute documentation.
 distribute companies using id
 -- distribute campaigns using company_id
 -- distribute ads using company_id from campaigns
 -- distribute clicks using company_id from ads, campaigns
 -- distribute impressions using company_id from ads, campaigns
 ;
|
51
test/citus/company.sql
vendored
Normal file
51
test/citus/company.sql
vendored
Normal file
@ -0,0 +1,51 @@
|
||||
-- Schema for the Citus multi-tenant example (ad analytics): companies own
-- campaigns, campaigns own ads, and ads collect clicks and impressions.
--
-- NOTE: this is the *pre-migration* shape of the schema — the child
-- tables reference only their direct parent. The company_id column and
-- backfilling on ads/clicks/impressions happen later, during the Citus
-- migration (see README.md and company.load).

-- Tenant root table; its id becomes the Citus distribution key.
CREATE TABLE companies (
    id bigserial PRIMARY KEY,
    name text NOT NULL,
    image_url text,
    created_at timestamp without time zone NOT NULL,
    updated_at timestamp without time zone NOT NULL
);

CREATE TABLE campaigns (
    id bigserial PRIMARY KEY,
    company_id bigint REFERENCES companies (id),
    name text NOT NULL,
    cost_model text NOT NULL,
    state text NOT NULL,
    monthly_budget bigint,
    blacklisted_site_urls text[],
    created_at timestamp without time zone NOT NULL,
    updated_at timestamp without time zone NOT NULL
);

CREATE TABLE ads (
    id bigserial PRIMARY KEY,
    campaign_id bigint REFERENCES campaigns (id),
    name text NOT NULL,
    image_url text,
    target_url text,
    impressions_count bigint DEFAULT 0,
    clicks_count bigint DEFAULT 0,
    created_at timestamp without time zone NOT NULL,
    updated_at timestamp without time zone NOT NULL
);

CREATE TABLE clicks (
    id bigserial PRIMARY KEY,
    ad_id bigint REFERENCES ads (id),
    clicked_at timestamp without time zone NOT NULL,
    site_url text NOT NULL,
    cost_per_click_usd numeric(20,10),
    user_ip inet NOT NULL,
    user_data jsonb NOT NULL
);

CREATE TABLE impressions (
    id bigserial PRIMARY KEY,
    ad_id bigint REFERENCES ads (id),
    seen_at timestamp without time zone NOT NULL,
    site_url text NOT NULL,
    cost_per_impression_usd numeric(20,10),
    user_ip inet NOT NULL,
    user_data jsonb NOT NULL
);
|
5
test/citus/copy.sql
vendored
Normal file
5
test/citus/copy.sql
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
-- Load the raw CSV files with psql's client-side \copy.
-- Only companies and campaigns are loaded here: the CSV exports for the
-- other three tables carry the backfilled company_id column, which the
-- pre-migration schema does not have, so those are loaded by pgloader
-- (see data.load) which skips that column.
\copy companies from 'companies.csv' with csv
\copy campaigns from 'campaigns.csv' with csv
-- \copy ads from 'ads.csv' with csv
-- \copy clicks from 'clicks.csv' with csv
-- \copy impressions from 'impressions.csv' with csv
|
68
test/citus/data.load
Normal file
68
test/citus/data.load
Normal file
@ -0,0 +1,68 @@
|
||||
--
-- Load the three CSV exports whose files contain the backfilled
-- company_id column: each load names company_id among the source fields
-- but leaves it out of the target columns, so the pre-migration schema
-- (see company.sql) only receives the columns it defines.
--

--
-- Ads
--
load csv
     from ads.csv
     (
        id, company_id, campaign_id, name, image_url, target_url,
        impressions_count, clicks_count, created_at, updated_at
     )

into postgresql:///hackathon

     target table ads
     target columns
     (
        id, campaign_id, name, image_url, target_url,
        impressions_count, clicks_count, created_at, updated_at
     )

with fields optionally enclosed by '"',
     fields escaped by double-quote,
     fields terminated by ',';

--
-- Clicks
--
load csv
     from clicks.csv
     (
        id, company_id, ad_id, clicked_at, site_url, cost_per_click_usd,
        user_ip, user_data
     )

into postgresql:///hackathon

     target table clicks
     target columns
     (
        id, ad_id, clicked_at, site_url, cost_per_click_usd, user_ip, user_data
     )

with fields optionally enclosed by '"',
     fields escaped by double-quote,
     fields terminated by ',';


--
-- Impressions
--
load csv
     from impressions.csv
     (
        id, company_id, ad_id, seen_at, site_url,
        cost_per_impression_usd, user_ip, user_data
     )

into postgresql:///hackathon

     target table impressions
     target columns
     (
        id, ad_id, seen_at, site_url, cost_per_impression_usd, user_ip, user_data
     )

-- NOTE(review): only this load uses `drop indexes` — presumably because
-- impressions is the largest table; confirm the omission on ads/clicks
-- is intentional.
with drop indexes,
     fields optionally enclosed by '"',
     fields escaped by double-quote,
     fields terminated by ',';
|
Loading…
Reference in New Issue
Block a user