9. Sales app example
# Customer of the transactional sales app; a customer has many orders.
class Customer < ActiveRecord::Base
  has_many :orders
end
# Order placed by a single customer and made up of many order items.
class Order < ActiveRecord::Base
  belongs_to :customer
  has_many :order_items
end
# Line item linking one order to one product.
class OrderItem < ActiveRecord::Base
  belongs_to :order
  belongs_to :product
end
# Product that belongs to one product class and appears on many order items.
class Product < ActiveRecord::Base
  belongs_to :product_class
  has_many :order_items
end
# Classification record that groups many products.
class ProductClass < ActiveRecord::Base
  has_many :products
end
24. Dimensional Modeling
What were the total sales amounts (fact or measure)
in California (Customer / Region dimension)
in Q1 2014 (Time dimension)
by product families (Product dimension)?
27. Data Warehouse Models
# Sales fact model: every row points at one customer, one product and one
# time dimension record.
class Dwh::SalesFact < Dwh::Fact
  belongs_to :customer, class_name: 'Dwh::CustomerDimension'
  belongs_to :product,  class_name: 'Dwh::ProductDimension'
  belongs_to :time,     class_name: 'Dwh::TimeDimension'
end
# Customer dimension; sales facts reference it through their customer_id column.
class Dwh::CustomerDimension < Dwh::Dimension
  # The class name must be a plain ASCII-quoted string: a typographic opening
  # quote here is not a string delimiter and makes the file unparsable.
  has_many :sales_facts, class_name: "Dwh::SalesFact",
                         foreign_key: "customer_id"
end
# Product dimension; referenced by sales facts through product_id and itself
# pointing at a product class dimension record.
class Dwh::ProductDimension < Dwh::Dimension
  has_many :sales_facts,
           class_name: 'Dwh::SalesFact',
           foreign_key: 'product_id'
  belongs_to :product_class,
             class_name: 'Dwh::ProductClassDimension'
end
# Product class dimension; product dimension rows reference it through
# product_class_id.
class Dwh::ProductClassDimension < Dwh::Dimension
  has_many :products,
           class_name: 'Dwh::ProductDimension',
           foreign_key: 'product_class_id'
end
# Time dimension; sales facts reference it through their time_id column.
class Dwh::TimeDimension < Dwh::Dimension
  # The class name must be a plain ASCII-quoted string: a typographic opening
  # quote here is not a string delimiter and makes the file unparsable.
  has_many :sales_facts, class_name: "Dwh::SalesFact",
                         foreign_key: "time_id"
end
28. Load Dimension
class Dwh::CustomerDimension < Dwh::Dimension
  # ...

  # Removes every row from the dimension table.
  def self.truncate!
    connection.execute "TRUNCATE TABLE #{table_name}"
  end

  # Rebuilds the dimension table as a straight copy of the listed columns
  # from the transactional customers table.
  #
  # Truncate and reload run inside one transaction so a failing INSERT rolls
  # the TRUNCATE back instead of leaving the dimension empty.
  # NOTE(review): relies on the database treating TRUNCATE as transactional
  # (PostgreSQL does) and on Dwh::Dimension being an ActiveRecord model, which
  # the existing use of `connection` / `table_name` suggests - confirm.
  #
  # Returns whatever `connection.insert` returns.
  def self.load!
    # Named to avoid shadowing ActiveRecord's own `column_names` class method;
    # joined once because the same list is used for both INSERT and SELECT.
    copied_columns = %w[id full_name city state_province country
                        birth_date gender created_at updated_at].join(',')

    transaction do
      truncate!
      connection.insert %[
        INSERT INTO #{table_name} (#{copied_columns})
        SELECT #{copied_columns}
        FROM #{::Customer.table_name}
      ]
    end
  end
end
29. Generate Time Dimension
class Dwh::TimeDimension < Dwh::Dimension
  # Inserts one dimension row for every distinct order date that is not yet
  # present in the dimension table.
  #
  # Each value returned by the query must respond to year/month/day/strftime.
  # Returns the list of dates that were processed.
  def self.load!
    # NULL order dates are filtered out explicitly: while the dimension table
    # is still empty, `NULL NOT IN (<empty set>)` is true, so a NULL would be
    # returned and `date.year` below would fail on nil.
    connection.select_values(%[
      SELECT DISTINCT order_date FROM #{Order.table_name}
      WHERE order_date IS NOT NULL
        AND order_date NOT IN
          (SELECT date_value FROM #{table_name})
    ]).each do |date|
      year, month, day = date.year, date.month, date.day
      quarter = ((month - 1) / 3) + 1           # months 1-3 => 1, ..., 10-12 => 4
      quarter_name = "Q#{quarter} #{year}"      # e.g. "Q1 2014"
      month_name = date.strftime("%b %Y")       # e.g. "Jan 2014"
      day_name = date.strftime("%b %d %Y")      # e.g. "Jan 17 2014"

      # Bind the values through sanitize_sql_array instead of interpolating them.
      sql = send :sanitize_sql_array, [
        %[
          INSERT INTO #{table_name}
            (id, date_value, year, quarter, month, day,
             year_name, quarter_name, month_name, day_name)
          VALUES
            (?, ?, ?, ?, ?, ?,
             ?, ?, ?, ?)
        ],
        date_to_id(date), date, year, quarter, month, day,
        year.to_s, quarter_name, month_name, day_name
      ]
      connection.insert sql
    end
  end
end
30. Load Facts
class Dwh::SalesFact < Dwh::Fact
  # Rebuilds the fact table: one row per order item, joined to its order for
  # the customer and the order date.
  #
  # Truncate and reload run inside one transaction so a failing INSERT rolls
  # the TRUNCATE back instead of leaving the fact table empty.
  # NOTE(review): relies on the database treating TRUNCATE as transactional
  # (PostgreSQL does; the query already uses PostgreSQL's to_char) - confirm.
  #
  # Returns whatever `connection.insert` returns.
  def self.load!
    transaction do
      truncate!
      # time_id is the order date rendered as the integer YYYYMMDD;
      # presumably this matches the ids stored in the time dimension - verify.
      connection.insert %[
        INSERT INTO #{table_name}
          (customer_id, product_id, time_id,
           sales_quantity, sales_amount, sales_cost)
        SELECT
          o.customer_id, oi.product_id,
          CAST(to_char(o.order_date, 'YYYYMMDD') AS INTEGER),
          oi.quantity, oi.amount, oi.cost
        FROM
          #{OrderItem.table_name} oi
          INNER JOIN #{Order.table_name} o ON o.id = oi.order_id
      ]
    end
  end
end
31. What were the
total sales amounts
in California
in Q1 2014
by product families?
# Sums sales_amount per product family for facts whose customer is in
# USA / CA and whose time record is year 2014, quarter 1.
# String literals use plain ASCII quotes; a typographic quote is a syntax error.
Dwh::SalesFact
  .joins(:customer)
  .joins(product: :product_class)
  .joins(:time)
  .where("d_customers.country" => "USA",
         "d_customers.state_province" => "CA")
  .where("d_time.year" => 2014, "d_time.quarter" => 1)
  .group("d_product_classes.product_family")
  .sum("sales_amount")
36. Time Dimension
Default hierarchy:
  All:     All Times
  Year:    2014, 2015
  Quarter: Q1, Q2, Q3, Q4
  Month:   JUL, AUG, SEP
  Day:     AUG 01, AUG 02
Weekly hierarchy:
  All:  All Times
  Year: 2014, 2015
  Week: W1, W2, W3, W4
  Day:  JAN 17, JAN 18, JAN 19
39. What were the
total sales amounts
in California
in Q1 2014
by product families?
# Sales Amount measure on columns, product family members on rows,
# restricted to the [USA].[CA] customer member and the Q1 2014 quarter.
olap.from('Sales')
    .columns('[Measures].[Sales Amount]')
    .rows('[Product].[Product Family].Members')
    .where('[Customer].[USA].[CA]', '[Time].[Quarter].[Q1 2014]')
40. MDX Query Language
# Query builder form: Sales Amount on columns, product family members on
# rows, sliced by one customer member and one time member of the Sales cube.
olap.from('Sales')
    .columns('[Measures].[Sales Amount]')
    .rows('[Product].[Product Family].Members')
    .where('[Customer].[USA].[CA]', '[Time].[Quarter].[Q1 2014]')
-- Sales Amount measure on the column axis, one row per Product Family member,
-- read from the Sales cube and sliced to customer [USA].[CA] and quarter [Q1 2014].
SELECT {[Measures].[Sales Amount]} ON COLUMNS,
[Product].[Product Family].Members ON ROWS
FROM [Sales]
WHERE ([Customer].[USA].[CA], [Time].[Quarter].[Q1 2014])
41. Results Caching
SELECT {[Measures].[Sales Amount], [Measures].[Sales Cost],
[Measures].[Customers Count]} ON COLUMNS,
[Product].[Product Family].Members ON ROWS
FROM [Sales] (21713.0ms)
SELECT {[Measures].[Sales Amount], [Measures].[Sales Cost],
[Measures].[Customers Count]} ON COLUMNS,
[Product].[Product Family].Members ON ROWS
FROM [Sales] (10.0ms)
42. Additional Attribute Dimension
# 'Gender' dimension read from the dwh.d_customers table and joined to the
# facts through their customer_id column. The level key is the raw gender
# column; the displayed name is derived from it by the SQL expression below.
dimension 'Gender', foreign_key: 'customer_id' do
  hierarchy all_member_name: 'All Genders', primary_key: 'id' do
    table 'd_customers', schema: 'dwh'
    level 'Gender', column: 'gender' do
      name_expression do
        # SQL string literals need plain ASCII single quotes on both sides;
        # a typographic opening quote produces invalid SQL.
        # Values other than 'F' / 'M' yield NULL (no ELSE branch).
        sql "CASE d_customers.gender
          WHEN 'F' THEN 'Female'
          WHEN 'M' THEN 'Male'
          END"
      end
    end
  end
end
# Sales Amount measure on columns, one row per member of the Gender level.
olap.from('Sales')
    .columns('[Measures].[Sales Amount]')
    .rows('[Gender].[Gender].Members')
43. Dynamic Attribute Dimension
# 'Age interval' dimension read from the dwh.d_customers table and joined to
# the facts through their customer_id column. The level key is computed at
# query time from the customer's birth date, so members shift as customers age.
dimension 'Age interval', foreign_key: 'customer_id' do
  hierarchy all_member_name: 'All Age', primary_key: 'id' do
    table 'd_customers', schema: 'dwh'
    level 'Age interval' do
      key_expression do
        # A NULL birth date makes every `age(...) < interval` comparison NULL,
        # which would otherwise fall through to the ELSE branch and report the
        # customer as '50+ years'; it gets its own bucket instead.
        sql %[
          CASE
          WHEN d_customers.birth_date IS NULL
            THEN 'Unknown'
          WHEN age(d_customers.birth_date) < interval '20 years'
            THEN '< 20 years'
          WHEN age(d_customers.birth_date) < interval '30 years'
            THEN '20-30 years'
          WHEN age(d_customers.birth_date) < interval '40 years'
            THEN '30-40 years'
          WHEN age(d_customers.birth_date) < interval '50 years'
            THEN '40-50 years'
          ELSE '50+ years'
          END
        ]
      end
    end
  end
end
[Age interval].[< 20 years]
[Age interval].[20-30 years]
[Age interval].[30-40 years]
[Age interval].[40-50 years]
[Age interval].[50+ years]
47. Ruby Tools for ETL
Kiba http://www.kiba-etl.org/
Square ETL https://github.com/square/ETL
48. Kiba example
# declare a ruby method here, for quick reusable logic
# Converts a "day/month/year" string such as "25/12/1999" into a Date.
# Raises ArgumentError when the string does not match that layout.
def parse_french_date(date)
  day_month_year = '%d/%m/%Y'
  Date.strptime(date, day_month_year)
end
# Preferably pull shared, unit-tested building blocks (sources, destinations,
# transforms) in from a separate Ruby file.
require_relative 'common'

# Where the rows come from; the source class is something you write yourself.
source MyCsvSource, 'input.csv'

# Row transform: replace the raw birth date field with its parsed value.
transform do |record|
  parsed_birth_date = parse_french_date(record[:birth_date])
  record[:birth_date] = parsed_birth_date
  # Hand the row back so it stays in the pipeline.
  record
end

# Row transform: a nil result drops the row, so only birth years before 2000
# continue down the pipeline.
transform do |record|
  record if record[:birth_date].year < 2000
end

# Row transform given as a class, which makes it easy to test on its own.
transform ComplianceCheckTransform, eula: 2015
50. Single-threaded ETL
class Dwh::TimeDimension < Dwh::Dimension
  # Inserts one dimension row for every distinct order date that is not yet
  # present in the dimension table, one date after another, with the logger
  # silenced for the duration.
  def self.load!
    logger.silence do
      # NULL order dates are filtered out explicitly: while the dimension
      # table is still empty, `NULL NOT IN (<empty set>)` is true, so a NULL
      # would be returned and insert_date would fail on nil.
      connection.select_values(%[
        SELECT DISTINCT order_date FROM #{Order.table_name}
        WHERE order_date IS NOT NULL
          AND order_date NOT IN (SELECT date_value FROM #{table_name})
      ]).each do |date|
        insert_date(date)
      end
    end
  end

  # Inserts the dimension row describing +date+.
  #
  # date - must respond to year, month, day and strftime.
  #
  # Returns whatever `connection.insert` returns.
  def self.insert_date(date)
    year, month, day = date.year, date.month, date.day
    quarter = ((month - 1) / 3) + 1           # months 1-3 => 1, ..., 10-12 => 4
    quarter_name = "Q#{quarter} #{year}"      # e.g. "Q1 2014"
    month_name = date.strftime("%b %Y")       # e.g. "Jan 2014"
    day_name = date.strftime("%b %d %Y")      # e.g. "Jan 17 2014"

    # Bind the values through sanitize_sql_array instead of interpolating them.
    sql = send :sanitize_sql_array, [
      %[
        INSERT INTO #{table_name}
          (id, date_value, year, quarter, month, day,
           year_name, quarter_name, month_name, day_name)
        VALUES
          (?, ?, ?, ?, ?, ?,
           ?, ?, ?, ?)
      ],
      date_to_id(date), date, year, quarter, month, day,
      year.to_s, quarter_name, month_name, day_name
    ]
    connection.insert sql
  end
end
51. ETL with Thread Pool
require 'concurrent/executors'
class Dwh::TimeDimension < Dwh::Dimension
  # Same job as a sequential load, but the per-date inserts are handed to a
  # fixed-size thread pool, each task running on its own pooled connection.
  #
  # pool_size - number of worker threads (default 4).
  #             NOTE(review): presumably must not exceed the ActiveRecord
  #             connection pool size, or tasks will wait for a connection.
  #
  # Returns the result of waiting for the pool to terminate.
  # Raises the first error captured from a failed insert task, if any.
  def self.parallel_load!(pool_size = 4)
    logger.silence do
      insert_date_pool = Concurrent::FixedThreadPool.new(pool_size)
      # The thread pool does not hand task exceptions back to the caller, so
      # they are collected here (Queue is thread-safe) and re-raised at the end.
      failures = Queue.new

      # NULL order dates are filtered out explicitly: while the dimension
      # table is still empty, `NULL NOT IN (<empty set>)` is true, so a NULL
      # would be returned and insert_date would fail on nil.
      connection.select_values(%[
        SELECT DISTINCT order_date FROM #{Order.table_name}
        WHERE order_date IS NOT NULL
          AND order_date NOT IN (SELECT date_value FROM #{table_name})
      ]).each do |date|
        insert_date_pool.post(date) do |missing_date|
          begin
            connection_pool.with_connection do
              insert_date(missing_date)
            end
          rescue StandardError => e
            failures << e
          end
        end
      end

      insert_date_pool.shutdown
      finished = insert_date_pool.wait_for_termination
      raise failures.pop unless failures.empty?
      finished
    end
  end
end
ETL with
Thread Pool
56. Analytical Query Performance
-- Per product family: total sales amount, total sales cost and the number of
-- distinct customers, joining the sales fact table to products and then to
-- product classes.
SELECT d_product_classes.product_family,
SUM(f_sales.sales_amount) AS sales_amount,
SUM(f_sales.sales_cost) AS sales_cost,
COUNT(DISTINCT f_sales.customer_id) AS customers_count
FROM "dwh"."f_sales"
INNER JOIN "dwh"."d_products" ON "dwh"."d_products"."id" =
"dwh"."f_sales"."product_id"
INNER JOIN "dwh"."d_product_classes" ON "dwh"."d_product_classes"."id" =
"dwh"."d_products"."product_class_id"
GROUP BY d_product_classes.product_family
always ~18 seconds
first ~9 seconds
next ~1.5 seconds
6 million rows
57. When to use what?
Fact table size | Traditional transactional databases | Analytical columnar databases
< 1M rows | OK | No big win
1-10M rows | Complex queries slower | OK
10-100M rows | Slow | OK
>100M rows | Very slow | OK with tuning
58. What did we cover?
Problems with analytical queries
Dimensional modeling
Star schemas
Mondrian OLAP and MDX
ETL – Extract, Transform, Load
Analytical columnar databases