applied-econometrics-2024/scripts/empirical.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Empirical assignment 2024 - 2025

STARTER FILE
"""

import os
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import pandas as pd

from helper import print_question, data_frame_to_latex_table_file

# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Start of Script for Empirical assignment Econometrics
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Set the folders for output of graphs and tables
# -----------------------------------------------------------------------------

# for the data (the .dta file is read from this folder)
data_dir = '../data/'
# for the figures
figure_dir = '../figures/'
# for the latex document
report_dir = '../report/'

# create the folders when they are missing; exist_ok=True makes this
# idempotent and avoids the gap between a separate existence check and
# the actual creation
os.makedirs(data_dir, exist_ok=True)
os.makedirs(figure_dir, exist_ok=True)
os.makedirs(report_dir, exist_ok=True)

# -----------------------------------------------------------------------------
# Set the random seed for our group (the product of the two birthday numbers below)
# -----------------------------------------------------------------------------


# first and second birthday, the two factors of the group seed
bd_1, bd_2 = 303, 309

# the group seed is the product of both birthdays
group_seed = bd_1 * bd_2

# seed numpy's global (legacy) random state
np.random.seed(group_seed)

# -----------------------------------------------------------------------------
# set the random number generator and seed
# -----------------------------------------------------------------------------

# dedicated generator seeded with the group seed for reproducible results
rng = np.random.default_rng(seed=group_seed)

# settings for output printing
print_line_length, print_line_start = 90, 5

# number of x points
num_points = 60

# -----------------------------------------------------------------------------
# Load data 2.1
# -----------------------------------------------------------------------------

# read the complete data set from the data folder
data_full = pd.read_stata(data_dir + 'assignment2025.dta')

# sample size for the group: 75% of all rows, truncated to an integer
num_obs = int(0.75 * len(data_full))
# draw the row positions randomly without replacement ( the rng uses your seed )
observations = rng.choice(len(data_full), size=num_obs, replace=False)
# keep the drawn rows as an independent copy for your group
data = data_full.iloc[observations].copy()

# to work with all observations instead, use: data = data_full

# -----------------------------------------------------------------------------
# Descriptive statistics 2.1
# -----------------------------------------------------------------------------

print_question('Question 2.1: Descriptive Statistics')

# drop the columns listed below before computing the statistics
data = data.drop(columns=['fail', 'urban', 'unearn', 'househ', 'amtland',
                          'unearnx'])

# keep only the rows with paidwork equal to 1; the explicit copy makes
# `data` an independent frame, so adding columns below does not trigger
# pandas' SettingWithCopyWarning
data = data[data['paidwork'] == 1].copy()

# schooling variable: sum of yprim and ysec
# (presumably years of primary and secondary schooling - confirm with the
# data description)
data['school'] = data['yprim'] + data['ysec']
# wage in levels, obtained by exponentiating the log wage
data['wage'] = np.exp(data['lwage'])

# compute the summary statistics
data_summary = data.describe()
# relabel the rows of describe(); the percentile rows get names without
# a '%' sign (presumably because '%' is a special character in LaTeX)
new_names = ['count', 'mean', 'std', 'min', '25pct', '50pct', '75pct', 'max']
data_summary.index = new_names

# print to screen (variables in rows, statistics in columns)
print(data_summary.T)

# export the summary statistics to a file
data_frame_to_latex_table_file(report_dir + 'summary_stats.tex',
                               data_summary.T)

# -----------------------------------------------------------------------------
# Question 2.2
# -----------------------------------------------------------------------------

print_question('Question 2.2: Plot histogram wage / lwage')

# Every histogram is drawn on its own figure and the figure is closed
# afterwards. Relying on the implicit current figure is fragile: on a
# non-GUI backend plt.show() does not close the figure, so the second
# histogram would be drawn on top of the first one.

# histogram of the wage in levels
fig_wage, ax_wage = plt.subplots()
ax_wage.hist(data['wage'], bins=21)
# save before showing, so the file is written while the figure still exists
fig_wage.savefig(figure_dir + "question_2_2_wage.png")
plt.show()
plt.close(fig_wage)

# histogram of the log wage
fig_lwage, ax_lwage = plt.subplots()
ax_lwage.hist(data['lwage'], bins=21)
fig_lwage.savefig(figure_dir + "question_2_2_lwage.png")
plt.show()
plt.close(fig_lwage)

# -----------------------------------------------------------------------------
# Question 2.3
# -----------------------------------------------------------------------------

print_question('Question 2.3: Sample correlations')

# variables that enter the correlation matrix
corr_columns = ['wage', 'age', 'school', 'men', 'malay', 'chinese', 'indian']
df = data[corr_columns]

# pairwise sample correlations between the selected variables
corr = df.corr()

# export the correlation matrix to a file
data_frame_to_latex_table_file(report_dir + 'table_2_3.tex', corr)

# -----------------------------------------------------------------------------
# Question 2.4
# -----------------------------------------------------------------------------

print_question('Question 2.4: Estimate lwage model')

# regressors for question 2.4: chinese and indian
x_vars_24 = data[['chinese', 'indian']]

# design matrix: the regressors plus a constant column
X_24 = sm.add_constant(x_vars_24)

# OLS regression of lwage on the design matrix
model_24 = sm.OLS(endog=data['lwage'], exog=X_24)
results_24 = model_24.fit()

# show the full OLS output on screen
print(results_24.summary())

# export the coefficients part of the summary (tables[1]) to a table
coef_table_24 = results_24.summary2().tables[1]
data_frame_to_latex_table_file(report_dir + 'results_24.tex', coef_table_24)

# -----------------------------------------------------------------------------
# Question 2.5
# -----------------------------------------------------------------------------

print_question('Question 2.5: Adding school')

# explanatory variables for question 2.5
x_vars_25 = data[['chinese', 'indian', 'school']]

# add a constant
X_25 = sm.add_constant(x_vars_25)

# set-up model
model_25 = sm.OLS(data['lwage'], X_25)

# estimate the model
results_25 = model_25.fit()

# print the OLS output
print(results_25.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_25.tex',
                               results_25.summary2().tables[1])

# scatter plot of lwage against school on its own figure; using an explicit
# figure (and closing it afterwards) keeps the plot from being drawn on top
# of an earlier figure that is still open, e.g. on a non-GUI backend where
# plt.show() does not close figures
fig_25, ax_25 = plt.subplots()
ax_25.scatter(data['school'], data['lwage'])
# save before showing, so the file is written while the figure still exists
fig_25.savefig(figure_dir + "question_2_5.png")
plt.show()
plt.close(fig_25)
# -----------------------------------------------------------------------------
# Question 2.6
# -----------------------------------------------------------------------------

print_question('Question 2.6: Adding age')

# explanatory variables for question 2.6 (linear in age)
x_vars_26 = data[['chinese', 'indian', 'school', 'age']]

# add a constant
X_26 = sm.add_constant(x_vars_26)

# set-up model
model_26 = sm.OLS(data['lwage'], X_26)

# estimate the model
results_26 = model_26.fit()

# print the OLS output
print(results_26.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_26.tex',
                               results_26.summary2().tables[1])

# estimated coefficients, indexed by variable name
coef = results_26.summary2().tables[1]['Coef.']
# lwage net of the estimated school effect; an earlier assignment that also
# subtracted the chinese and indian effects was overwritten immediately by
# this one and has been removed as dead code
# NOTE(review): lwage_26 is not used below - the scatter plot shows the raw
# lwage. Confirm whether the plot was meant to use lwage_26 instead.
lwage_26 = data['lwage'] - coef['school'] * data['school']

# scatter plot of lwage against age on its own figure; using an explicit
# figure (and closing it afterwards) keeps the plot from being drawn on top
# of an earlier figure that is still open, e.g. on a non-GUI backend where
# plt.show() does not close figures
fig_26, ax_26 = plt.subplots()
ax_26.scatter(data['age'], data['lwage'])
# save before showing, so the file is written while the figure still exists
fig_26.savefig(figure_dir + "question_2_6.png")
plt.show()
plt.close(fig_26)

# regressors for the second model of question 2.6: agesq is added to the
# regressors of the linear-in-age model
x_vars_26b = data[['chinese', 'indian', 'school', 'age', 'agesq']]

# design matrix: the regressors plus a constant column
X_26b = sm.add_constant(x_vars_26b)

# OLS regression of lwage on the design matrix
model_26b = sm.OLS(endog=data['lwage'], exog=X_26b)
results_26b = model_26b.fit()

# show the full OLS output on screen
print(results_26b.summary())

# export the coefficients part of the summary (tables[1]) to a table
coef_table_26b = results_26b.summary2().tables[1]
data_frame_to_latex_table_file(report_dir + 'results_26b.tex', coef_table_26b)

# -----------------------------------------------------------------------------
# Question 2.7
# -----------------------------------------------------------------------------

print_question('Question 2.7: Create the woman variable')

# women is defined as one minus men
men_indicator = data['men']
data['women'] = 1 - men_indicator

# -----------------------------------------------------------------------------
# Question 2.8
# -----------------------------------------------------------------------------

print_question('Question 2.8: lwage model')

# regressors for question 2.8: age, school, chinese, indian and men
x_vars_28 = data[['age', 'school', 'chinese', 'indian', 'men']]

# design matrix: the regressors plus a constant column
X_28 = sm.add_constant(x_vars_28)

# OLS regression of lwage on the design matrix
model_28 = sm.OLS(endog=data['lwage'], exog=X_28)
results_28 = model_28.fit()

# show the full OLS output on screen
print(results_28.summary())

# export the coefficients part of the summary (tables[1]) to a table
coef_table_28 = results_28.summary2().tables[1]
data_frame_to_latex_table_file(report_dir + 'results_28.tex', coef_table_28)


# -----------------------------------------------------------------------------
# Question 2.9
# -----------------------------------------------------------------------------

print_question('Question 2.9: Test ethnicity')
# TODO: question 2.9 is not implemented yet - only the heading is printed

# -----------------------------------------------------------------------------
# Question 2.10
# -----------------------------------------------------------------------------

print_question('Question 2.10: Estimate models separately')
# TODO: question 2.10 is not implemented yet - only the heading is printed

# -----------------------------------------------------------------------------
# Question 2.11
# -----------------------------------------------------------------------------

print_question('Question 2.11: Predict lwage')
# TODO: question 2.11 is not implemented yet - only the heading is printed

# -----------------------------------------------------------------------------
# Question 2.12
# -----------------------------------------------------------------------------

print_question('Question 2.12: Estimate model with squared terms')
# TODO: question 2.12 is not implemented yet - only the heading is printed

# -----------------------------------------------------------------------------
# Question 2.13
# -----------------------------------------------------------------------------

print_question('Question 2.13: Squared terms')
# TODO: question 2.13 is not implemented yet - only the heading is printed


# -----------------------------------------------------------------------------
# Question 2.14
# -----------------------------------------------------------------------------

print_question('Question 2.14: Add interaction terms')
# TODO: question 2.14 is not implemented yet - only the heading is printed


# -----------------------------------------------------------------------------
# Question 2.15
# -----------------------------------------------------------------------------

print_question('Question 2.15: Find your favourite model')
# TODO: question 2.15 is not implemented yet - only the heading is printed