top of page

Design and create a date dimension

Back to main "Create a data lake"

 

I usually use Apache Parquet Viewer to test the output from Parquet files .

DateDIM.jpg

My first dimension here is a date dimension. Using a date dimension, you can simply add only a key value to the fact table instead of a date. This way it's up to the reader of the data to decide the date format without time consuming ​conversions. 

The example returns 60-year span 30 years back and 30 years into the future. It gets its data from pandas.date_range.

from ast import Index, Str

from datetime import datetime

from email.utils import format_datetime

from operator import index

from zlib import DEF_BUF_SIZE

import pandas as pd

import hashlib as hl

import pyarrow as pa

 

# The scope of the Dimension the default give a 60 years span in the DIM with a fiscal year ending in July

StartDate = '1992-07-29'

EndDate = '2052-10-03'

FiscalEndDate = 'JUL'

OutFile = 'C:\\VsCodeRepo\\tweets\\dateDIM.parquet'

 

def create_dim_date(start_date, end_date, FiscalEndDate):

    '''

    Create Dimension Date in Pandas

   

    :return df_date : DataFrame

    '''

    from pandas.tseries.offsets import MonthEnd, QuarterEnd

 

    # Construct DIM Date Dataframe

    df_date = pd.DataFrame({"Date": pd.date_range(start=f'{start_date}', end=f'{end_date}', freq='D')})

 

    def get_end_of_month(pd_date):

        if pd_date.is_month_end == True:

            return pd_date

        else:

            return pd_date + MonthEnd(1)


 

    def get_end_of_quarter(pd_date):

        if pd_date.is_quarter_end == True:

            return pd_date

        else:

            return pd_date + QuarterEnd(1)

    # Add in attributes

    df_date['DateID'] = df_date.Date.apply(lambda x: x.strftime('%Y%m%d'))

    df_date["DayOfMonth"] = df_date.Date.dt.day

    df_date["DayNameShortEng"] = df_date.Date.dt.strftime("%a")

    df_date["DayNameEng"] = df_date.Date.dt.strftime("%A")

    df_date["DayOfWeek"] = df_date.Date.dt.dayofweek + 1

    df_date["WeekInYear"] = df_date.Date.dt.isocalendar().week

    df_date['WeekText'] = df_date.Date.apply(lambda x: f'Week {x.isocalendar()[1]}')

    df_date["Month"] = df_date.Date.dt.month

    df_date["YearQuarter"] = df_date.Date.apply(lambda x: f'{x.year}Q{x.quarter}')

    df_date["Quarter"] = df_date.Date.dt.quarter

    df_date["QuarterName"] = df_date.Date.apply(lambda x: f'Q{x.quarter}')

    df_date["Period"] =  df_date.Date.apply(lambda x: f'{x.year}{x.month}')

    df_date["FiscalYearQuarter"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate)

    df_date["FiscalQuarter"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate).apply(lambda x: f'{x.quarter}')

    df_date["FiscalQuarterName"] = df_date['Date'].dt.to_period('Q-' + FiscalEndDate).apply(lambda x: f'Q{x.quarter}')

    df_date["Fiscal_Year"] = df_date['Date'].dt.to_period('A-' + FiscalEndDate)

    df_date["Year"] = df_date.Date.dt.year

    df_date['EndOfMonth'] = df_date['Date'].apply(get_end_of_month)

    df_date['EOM_YN'] = df_date['Date'].dt.is_month_end

    df_date['EndOfQuarter'] = df_date['Date'].apply(get_end_of_quarter)

    df_date['EOQ_YN'] = df_date['Date'].dt.is_quarter_end

 

    return df_date

def create_dim_date_with_workday_Period(start_date, end_date, FiscalEndDate):

    '''

    Creates a dimension date that has workday_YN column.

    Also pads a 0 to the period using inserter

    :return df_date : DataFrame

    '''

    # Padding inserter s = In string, a = value added and n = place to add

    def inserter(s,a,n):

        return s[:n]+a+s[n:]

 

    df_date = create_dim_date(start_date, end_date, FiscalEndDate)
 

    # Start with default that it is workday

    df_date['Workday_YN'] = True

    for index, row in df_date.iterrows():

        if row['DayNameEng'] in ['Saturday', 'Sunday']:

            df_date.loc[index, 'Workday_YN'] = False

        date = row['Date'].strftime("%Y-%m-%d")            

        if row['Month'] < 10:

            df_date.loc[index, 'Period'] = inserter(s = row['Period'], a = "0", n = 4)

    return df_date

 

# Prepares the dataframe to be printed to Parquet

df = create_dim_date_with_workday_Period(StartDate, EndDate, FiscalEndDate)

# Strings values to be more represenable in Apache Parquet Viewer

df['Date'] = df['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))

df['EndOfMonth'] = df['EndOfMonth'].apply(lambda x: x.strftime('%Y-%m-%d'))

df['EndOfQuarter'] = df['EndOfQuarter'].apply(lambda x: x.strftime('%Y-%m-%d'))

 

# print(df)

# Using Pyarrow to export Dataframe to parquiet

df.to_parquet(OutFile, use_dictionary=False, engine='pyarrow', compression='gzip')

Back to main "Create a data lake"

FullCode
bottom of page