Design and create a date dimension

I usually use Apache Parquet Viewer to test the output from Parquet files .

My first dimension here is a date dimension. Using a date dimension, you can simply add only a key value to the fact table instead of a date. This way it's up to the reader of the data to decide the date format without time consuming conversions.

The example returns 60-year span 30 years back and 30 years into the future. It gets its data from pandas.date_range.

from ast import Index, Str

from datetime import datetime

from email.utils import format_datetime

from operator import index

from zlib import DEF_BUF_SIZE

import pandas as pd

import hashlib as hl

import pyarrow as pa

# The scope of the Dimension the default give a 60 years span in the DIM with a fiscal year ending in July

StartDate = '1992-07-29'

EndDate = '2052-10-03'

FiscalEndDate = 'JUL'

OutFile = 'C:\\VsCodeRepo\\tweets\\dateDIM.parquet'

def create_dim_date(start_date, end_date, FiscalEndDate):

'''

Create Dimension Date in Pandas

:return df_date : DataFrame

'''

from pandas.tseries.offsets import MonthEnd, QuarterEnd

# Construct DIM Date Dataframe

df_date = pd.DataFrame({"Date": pd.date_range(start=f'{start_date}', end=f'{end_date}', freq='D')})

def get_end_of_month(pd_date):

if pd_date.is_month_end == True:

return pd_date

else:

return pd_date + MonthEnd(1)

def get_end_of_quarter(pd_date):

if pd_date.is_quarter_end == True:

return pd_date

else:

return pd_date + QuarterEnd(1)

# Add in attributes

df_date['DateID'] = df_date.Date.apply(lambda x: x.strftime('%Y%m%d'))

df_date["DayOfMonth"] = df_date.Date.dt.day

df_date["DayNameShortEng"] = df_date.Date.dt.strftime("%a")

df_date["DayNameEng"] = df_date.Date.dt.strftime("%A")

df_date["DayOfWeek"] = df_date.Date.dt.dayofweek + 1

df_date["WeekInYear"] = df_date.Date.dt.isocalendar().week

df_date['WeekText'] = df_date.Date.apply(lambda x: f'Week {x.isocalendar()[1]}')

df_date["Month"] = df_date.Date.dt.month

df_date["YearQuarter"] = df_date.Date.apply(lambda x: f'{x.year}Q{x.quarter}')

df_date["Quarter"] = df_date.Date.dt.quarter

df_date["QuarterName"] = df_date.Date.apply(lambda x: f'Q{x.quarter}')

df_date["Period"] = df_date.Date.apply(lambda x: f'{x.year}{x.month}')

df_date["FiscalYearQuarter"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate)

df_date["FiscalQuarter"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate).apply(lambda x: f'{x.quarter}')

df_date["FiscalQuarterName"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate).apply(lambda x: f'Q{x.quarter}')

df_date["Fiscal_Year"] = df_date['Date'].dt.to_period('A-' + FiscalEndDate)

df_date["Year"] = df_date.Date.dt.year

df_date['EndOfMonth'] = df_date['Date'].apply(get_end_of_month)

df_date['EOM_YN'] = df_date['Date'].dt.is_month_end

df_date['EndOfQuarter'] = df_date['Date'].apply(get_end_of_quarter)

df_date['EOQ_YN'] = df_date['Date'].dt.is_quarter_end

return df_date

def create_dim_date_with_workday_Period(start_date, end_date, FiscalEndDate):

'''

Creates a dimension date that has workday_YN column.

Also pads a 0 to the period using inserter

:return df_date : DataFrame

'''

# Padding inserter s = In string, a = value added and n = place to add

def inserter(s,a,n):

return s[:n]+a+s[n:]

df_date = create_dim_date(start_date, end_date, FiscalEndDate)

# Start with default that it is workday

df_date['Workday_YN'] = True

for index, row in df_date.iterrows():

if row['DayNameEng'] in ['Saturday', 'Sunday']:

df_date.loc[index, 'Workday_YN'] = False

date = row['Date'].strftime("%Y-%m-%d")

if row['Month'] < 10:

df_date.loc[index, 'Period'] = inserter(s = row['Period'], a = "0", n = 4)

return df_date

# Prepares the dataframe to be printed to Parquet

df = create_dim_date_with_workday_Period(StartDate, EndDate, FiscalEndDate)

# Strings values to be more represenable in Apache Parquet Viewer

df['Date'] = df['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))

df['EndOfMonth'] = df['EndOfMonth'].apply(lambda x: x.strftime('%Y-%m-%d'))

df['EndOfQuarter'] = df['EndOfQuarter'].apply(lambda x: x.strftime('%Y-%m-%d'))

# print(df)

# Using Pyarrow to export Dataframe to parquiet

df.to_parquet(OutFile, use_dictionary=False, engine='pyarrow', compression='gzip')

Back to main "Create a data lake"