Add a Citus distribution test case, from the citus tutorial.

This commit is contained in:
Dimitri Fontaine 2018-10-18 15:42:17 +02:00
parent d3b21ac54d
commit 7b487ddaca
7 changed files with 199 additions and 0 deletions

1
test/citus/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.csv

20
test/citus/Makefile Normal file
View File

@ -0,0 +1,20 @@
# Fetch the Citus multi-tenant tutorial dataset and load it into the
# local "hackathon" database, using pgloader for the tables whose CSV
# files still carry the backfilled company_id column.
DATASET := companies campaigns ads clicks impressions geo_ips
CSV     := $(addsuffix .csv,$(DATASET))
DROP    := DROP TABLE IF EXISTS companies, campaigns, ads, clicks, impressions, geo_ips

# Remove half-written targets (e.g. a partially downloaded CSV) when a
# recipe fails, so they don't look "up to date" on the next run.
.DELETE_ON_ERROR:

all: schema data ;

schema:
	psql --single-transaction -c "$(DROP)" -d hackathon
	psql --single-transaction -f company.sql -d hackathon

# copy.sql loads the tables that match the CSV layout directly; the
# remaining CSV files are loaded through pgloader (see data.load).
data: fetch
	psql -f copy.sql -d hackathon
	../../build/bin/pgloader ./data.load

fetch: $(CSV) ;

# -f makes curl fail on HTTP errors instead of saving the error page as
# a .csv; -sSL keeps it quiet but still reports failures and follows
# redirects.
%.csv:
	curl -fsSL -O https://examples.citusdata.com/mt_ref_arch/$@

.PHONY: all schema data fetch

42
test/citus/README.md Normal file
View File

@ -0,0 +1,42 @@
# Citus Multi-Tenant Automatic Distribution
This test case follows the steps documented at:
https://docs.citusdata.com/en/v7.5/use_cases/multi_tenant.html
We install the schema before Citus migration, and load the data without the
backfilling that is already done. For that we use pgloader to ignore the
company_id column in the tables that didn't have this column prior to the
Citus migration effort.
Then the following `company.load` file contains the pgloader command that
runs a full migration from PostgreSQL to Citus:
```
load database
from pgsql:///hackathon
into pgsql://localhost:9700/dim
with include drop, reset no sequences
distribute companies using id;
```
Tables are marked distributed, the company_id column is added where it's
needed, primary keys and foreign keys definitions are altered to the new
model, and finally the data is backfilled automatically in the target table
thanks to generating queries like the following:
~~~
SELECT "campaigns".company_id::text,
"impressions".id::text,
"impressions".ad_id::text,
"impressions".seen_at::text,
"impressions".site_url::text,
"impressions".cost_per_impression_usd::text,
"impressions".user_ip::text,
"impressions".user_data::text
FROM "public"."impressions"
JOIN "public"."ads" ON impressions.ad_id = ads.id
JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id
~~~

12
test/citus/company.load Normal file
View File

@ -0,0 +1,12 @@
-- pgloader command file: migrate the local "hackathon" database to a
-- Citus cluster (coordinator on localhost:9700), dropping any existing
-- target tables first and keeping sequence values as-is.
load database
from pgsql:///hackathon
into pgsql://localhost:9700/dim
with include drop, reset no sequences
-- Only the root of the distribution tree needs an explicit rule; the
-- commented-out clauses below spell out what pgloader derives for the
-- dependent tables from the foreign keys (see README.md — the
-- company_id column is added and backfilled automatically where
-- needed).
distribute companies using id
-- distribute campaigns using company_id
-- distribute ads using company_id from campaigns
-- distribute clicks using company_id from ads, campaigns
-- distribute impressions using company_id from ads, campaigns
;

51
test/citus/company.sql vendored Normal file
View File

@ -0,0 +1,51 @@
-- Pre-Citus-migration schema from the Citus multi-tenant tutorial:
-- clicks and impressions have no company_id column yet — pgloader adds
-- and backfills it during distribution (see company.load / README.md).
-- NOTE: column order here must match the CSV column order relied on by
-- copy.sql and data.load.

-- Tenant root table; every other table hangs off it via foreign keys.
CREATE TABLE companies (
id bigserial PRIMARY KEY,
name text NOT NULL,
image_url text,
created_at timestamp without time zone NOT NULL,
updated_at timestamp without time zone NOT NULL
);

-- Campaigns belong directly to a company.
CREATE TABLE campaigns (
id bigserial PRIMARY KEY,
company_id bigint REFERENCES companies (id),
name text NOT NULL,
cost_model text NOT NULL,
state text NOT NULL,
monthly_budget bigint,
blacklisted_site_urls text[],
created_at timestamp without time zone NOT NULL,
updated_at timestamp without time zone NOT NULL
);

-- Ads reach the company only through campaigns (no company_id here).
CREATE TABLE ads (
id bigserial PRIMARY KEY,
campaign_id bigint REFERENCES campaigns (id),
name text NOT NULL,
image_url text,
target_url text,
impressions_count bigint DEFAULT 0,
clicks_count bigint DEFAULT 0,
created_at timestamp without time zone NOT NULL,
updated_at timestamp without time zone NOT NULL
);

-- Event table keyed to ads; company_id is derived via ads → campaigns.
CREATE TABLE clicks (
id bigserial PRIMARY KEY,
ad_id bigint REFERENCES ads (id),
clicked_at timestamp without time zone NOT NULL,
site_url text NOT NULL,
cost_per_click_usd numeric(20,10),
user_ip inet NOT NULL,
user_data jsonb NOT NULL
);

-- Same shape as clicks; also lacks company_id before the migration.
CREATE TABLE impressions (
id bigserial PRIMARY KEY,
ad_id bigint REFERENCES ads (id),
seen_at timestamp without time zone NOT NULL,
site_url text NOT NULL,
cost_per_impression_usd numeric(20,10),
user_ip inet NOT NULL,
user_data jsonb NOT NULL
);

5
test/citus/copy.sql vendored Normal file
View File

@ -0,0 +1,5 @@
-- psql bulk load for the tables whose CSV columns match the
-- pre-migration schema exactly. The other three CSV files carry a
-- backfilled company_id column that the schema does not have yet, so
-- they are loaded through pgloader instead (see data.load and the
-- Makefile's `data` target) — hence the commented-out lines below.
\copy companies from 'companies.csv' with csv
\copy campaigns from 'campaigns.csv' with csv
-- \copy ads from 'ads.csv' with csv
-- \copy clicks from 'clicks.csv' with csv
-- \copy impressions from 'impressions.csv' with csv

68
test/citus/data.load Normal file
View File

@ -0,0 +1,68 @@
--
-- Ads
--
-- The tutorial CSV files already contain the backfilled company_id
-- column; the pre-migration schema does not. Each load below therefore
-- names company_id in the input field list but omits it from the
-- target columns, so pgloader reads and discards it.
load csv
from ads.csv
(
id, company_id, campaign_id, name, image_url, target_url,
impressions_count, clicks_count, created_at, updated_at
)
into postgresql:///hackathon
target table ads
target columns
(
id, campaign_id, name, image_url, target_url,
impressions_count, clicks_count, created_at, updated_at
)
with fields optionally enclosed by '"',
fields escaped by double-quote,
fields terminated by ',';
--
-- Clicks
--
-- Same pattern: drop company_id on the way in.
load csv
from clicks.csv
(
id, company_id, ad_id, clicked_at, site_url, cost_per_click_usd,
user_ip, user_data
)
into postgresql:///hackathon
target table clicks
target columns
(
id, ad_id, clicked_at, site_url, cost_per_click_usd, user_ip, user_data
)
with fields optionally enclosed by '"',
fields escaped by double-quote,
fields terminated by ',';
--
-- Impressions
--
-- Same pattern again; `drop indexes` is presumably here to speed up
-- the bulk load of this (larger) table — confirm against the pgloader
-- reference for index-rebuild behavior after the load.
load csv
from impressions.csv
(
id, company_id, ad_id, seen_at, site_url,
cost_per_impression_usd, user_ip, user_data
)
into postgresql:///hackathon
target table impressions
target columns
(
id, ad_id, seen_at, site_url, cost_per_impression_usd, user_ip, user_data
)
with drop indexes,
fields optionally enclosed by '"',
fields escaped by double-quote,
fields terminated by ',';