Design and create a date dimension
Back to main "Create a data lake"
I usually use Apache Parquet Viewer to test the output from Parquet files .
My first dimension here is a date dimension. Using a date dimension, you can simply add only a key value to the fact table instead of a date. This way it's up to the reader of the data to decide the date format without time consuming conversions.
The example returns 60-year span 30 years back and 30 years into the future. It gets its data from pandas.date_range.
from ast import Index, Str
from datetime import datetime
from email.utils import format_datetime
from operator import index
from zlib import DEF_BUF_SIZE
import pandas as pd
import hashlib as hl
import pyarrow as pa
# The scope of the Dimension the default give a 60 years span in the DIM with a fiscal year ending in July
StartDate = '1992-07-29'
EndDate = '2052-10-03'
FiscalEndDate = 'JUL'
OutFile = 'C:\\VsCodeRepo\\tweets\\dateDIM.parquet'
def create_dim_date(start_date, end_date, FiscalEndDate):
'''
Create Dimension Date in Pandas
:return df_date : DataFrame
'''
from pandas.tseries.offsets import MonthEnd, QuarterEnd
# Construct DIM Date Dataframe
df_date = pd.DataFrame({"Date": pd.date_range(start=f'{start_date}', end=f'{end_date}', freq='D')})
def get_end_of_month(pd_date):
if pd_date.is_month_end == True:
return pd_date
else:
return pd_date + MonthEnd(1)
def get_end_of_quarter(pd_date):
if pd_date.is_quarter_end == True:
return pd_date
else:
return pd_date + QuarterEnd(1)
# Add in attributes
df_date['DateID'] = df_date.Date.apply(lambda x: x.strftime('%Y%m%d'))
df_date["DayOfMonth"] = df_date.Date.dt.day
df_date["DayNameShortEng"] = df_date.Date.dt.strftime("%a")
df_date["DayNameEng"] = df_date.Date.dt.strftime("%A")
df_date["DayOfWeek"] = df_date.Date.dt.dayofweek + 1
df_date["WeekInYear"] = df_date.Date.dt.isocalendar().week
df_date['WeekText'] = df_date.Date.apply(lambda x: f'Week {x.isocalendar()[1]}')
df_date["Month"] = df_date.Date.dt.month
df_date["YearQuarter"] = df_date.Date.apply(lambda x: f'{x.year}Q{x.quarter}')
df_date["Quarter"] = df_date.Date.dt.quarter
df_date["QuarterName"] = df_date.Date.apply(lambda x: f'Q{x.quarter}')
df_date["Period"] = df_date.Date.apply(lambda x: f'{x.year}{x.month}')
df_date["FiscalYearQuarter"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate)
df_date["FiscalQuarter"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate).apply(lambda x: f'{x.quarter}')
df_date["FiscalQuarterName"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate).apply(lambda x: f'Q{x.quarter}')
df_date["Fiscal_Year"] = df_date['Date'].dt.to_period('A-' + FiscalEndDate)
df_date["Year"] = df_date.Date.dt.year
df_date['EndOfMonth'] = df_date['Date'].apply(get_end_of_month)
df_date['EOM_YN'] = df_date['Date'].dt.is_month_end
df_date['EndOfQuarter'] = df_date['Date'].apply(get_end_of_quarter)
df_date['EOQ_YN'] = df_date['Date'].dt.is_quarter_end
return df_date
def create_dim_date_with_workday_Period(start_date, end_date, FiscalEndDate):
'''
Creates a dimension date that has workday_YN column.
Also pads a 0 to the period using inserter
:return df_date : DataFrame
'''
# Padding inserter s = In string, a = value added and n = place to add
def inserter(s,a,n):
return s[:n]+a+s[n:]
df_date = create_dim_date(start_date, end_date, FiscalEndDate)
# Start with default that it is workday
df_date['Workday_YN'] = True
for index, row in df_date.iterrows():
if row['DayNameEng'] in ['Saturday', 'Sunday']:
df_date.loc[index, 'Workday_YN'] = False
date = row['Date'].strftime("%Y-%m-%d")
if row['Month'] < 10:
df_date.loc[index, 'Period'] = inserter(s = row['Period'], a = "0", n = 4)
return df_date
# Prepares the dataframe to be printed to Parquet
df = create_dim_date_with_workday_Period(StartDate, EndDate, FiscalEndDate)
# Strings values to be more represenable in Apache Parquet Viewer
df['Date'] = df['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))
df['EndOfMonth'] = df['EndOfMonth'].apply(lambda x: x.strftime('%Y-%m-%d'))
df['EndOfQuarter'] = df['EndOfQuarter'].apply(lambda x: x.strftime('%Y-%m-%d'))
# print(df)
# Using Pyarrow to export Dataframe to parquiet
df.to_parquet(OutFile, use_dictionary=False, engine='pyarrow', compression='gzip')