#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Empirical assignment 2024 - 2025

STARTER FILE
"""
|
|
|
|
import os
|
|
import numpy as np
|
|
import numpy.linalg as la
|
|
import matplotlib.pyplot as plt
|
|
import statsmodels.api as sm
|
|
import scipy.stats as stats
|
|
import pandas as pd
|
|
|
|
from helper import print_question, data_frame_to_latex_table_file
|
|
|
|
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Start of Script for Empirical assignment Econometrics
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Set the folders for the input data and for the output of graphs and tables
# -----------------------------------------------------------------------------

# for the data set
data_dir = '../data/'
# for the figures
figure_dir = '../figures/'
# for the latex document
report_dir = '../report/'

# Create every folder that does not exist yet. exist_ok=True makes the call a
# no-op for an existing folder, so no separate existence check is needed (such
# a check could race with another process creating the same folder). If the
# path exists but is not a directory, os.makedirs raises FileExistsError.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(figure_dir, exist_ok=True)
os.makedirs(report_dir, exist_ok=True)
|
|
|
|
# -----------------------------------------------------------------------------
# Group seed: the product of the two birthday numbers below
# -----------------------------------------------------------------------------

# first and second birthday
bd_1, bd_2 = 303, 309

group_seed = bd_1 * bd_2

# seed NumPy's legacy global random state
np.random.seed(group_seed)

# -----------------------------------------------------------------------------
# Random number generator and general settings
# -----------------------------------------------------------------------------

# dedicated generator, seeded with the group seed for reproducible results
rng = np.random.default_rng(seed=group_seed)

# settings for output printing
print_line_start = 5
print_line_length = 90

# number of x points
num_points = 60
|
|
|
|
# -----------------------------------------------------------------------------
# Load data 2.1
# -----------------------------------------------------------------------------

# read the complete data set from the Stata file in the data folder
data_full = pd.read_stata(f"{data_dir}assignment2025.dta")

# size of the group sample: 75% of all rows, rounded down
num_obs = int(len(data_full) * 0.75)

# draw that many distinct row positions (the rng uses your seed)
observations = rng.choice(data_full.shape[0], size=num_obs, replace=False)

# keep only the drawn rows, as an independent copy of the full data
data = data_full.iloc[observations].copy()

# data = data_full  # alternative: work with the full data set instead
|
|
|
|
# -----------------------------------------------------------------------------
# Descriptive statistics 2.1
# -----------------------------------------------------------------------------

print_question('Question 2.1: Descriptive Statistics')

# drop the columns that are not referenced anywhere below
data = data.drop(
    columns=['fail', 'urban', 'unearn', 'househ', 'amtland', 'unearnx'])

# Keep only the observations with paidwork == 1. The explicit copy makes
# `data` an independent frame, so the column assignments below (and any later
# ones) do not trigger pandas' SettingWithCopyWarning.
data = data.loc[data['paidwork'] == 1].copy()

# derived variables: total schooling and the wage level
data['school'] = data['yprim'] + data['ysec']
data['wage'] = np.exp(data['lwage'])

# compute the summary statistics
data_summary = data.describe()

# Relabel the rows of the summary; the percentile rows get names without '%'.
# NOTE(review): presumably because '%' starts a comment in LaTeX - confirm
# against what the export helper escapes.
new_names = ['count', 'mean', 'std', 'min', '25pct', '50pct', '75pct', 'max']
data_summary.index = new_names

# print to screen (one row per variable)
print(data_summary.T)

# export the summary statistics to a file
data_frame_to_latex_table_file(report_dir + 'summary_stats.tex',
                               data_summary.T)
|
|
|
|
# -----------------------------------------------------------------------------
# Question 2.2
# -----------------------------------------------------------------------------

print_question('Question 2.2: Plot histogram wage / lwage')

# Each histogram is drawn on its own figure, and that figure is closed once it
# has been saved and shown. Drawing both on pyplot's implicit current figure
# is not safe: when plt.show() leaves the figure open (for example under a
# non-interactive backend), the second histogram is added to the axes of the
# first one and the saved lwage plot also contains the wage bars.
for variable in ('wage', 'lwage'):
    fig, ax = plt.subplots()
    ax.hist(data[variable], bins=21)
    fig.savefig(f"{figure_dir}question_2_2_{variable}.png")
    plt.show()
    plt.close(fig)
|
|
|
|
# -----------------------------------------------------------------------------
# Question 2.3
# -----------------------------------------------------------------------------

print_question('Question 2.3: Sample correlations')

# variables that enter the correlation matrix
corr_columns = ['wage', 'age', 'school', 'men', 'malay', 'chinese', 'indian']
df = data.loc[:, corr_columns]

# pairwise sample correlations between the selected variables
corr = df.corr()

# export the correlation matrix to a file
data_frame_to_latex_table_file(report_dir + 'table_2_3.tex', corr)
|
|
|
|
# -----------------------------------------------------------------------------
# Question 2.4
# -----------------------------------------------------------------------------

print_question('Question 2.4: Estimate lwage model')

# regressors for question 2.4, with a constant column added
x_vars_24 = data.loc[:, ['chinese', 'indian']]
X_24 = sm.add_constant(x_vars_24)

# specify the regression of lwage on these regressors and estimate it by OLS
model_24 = sm.OLS(endog=data['lwage'], exog=X_24)
results_24 = model_24.fit()

# print the OLS output
print(results_24.summary())

# export the coefficients part of the summary to a table
coef_table_24 = results_24.summary2().tables[1]
data_frame_to_latex_table_file(report_dir + 'results_24.tex', coef_table_24)
|
|
|
|
# -----------------------------------------------------------------------------
# Question 2.5
# -----------------------------------------------------------------------------

print_question('Question 2.5: Adding school')

# explanatory variables for question 2.5
x_vars_25 = data[['chinese', 'indian', 'school']]

# add a constant
X_25 = sm.add_constant(x_vars_25)

# set-up model
model_25 = sm.OLS(data['lwage'], X_25)

# estimate the model
results_25 = model_25.fit()

# print the OLS output
print(results_25.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_25.tex',
                               results_25.summary2().tables[1])

# Scatter plot of lwage against school. It is drawn on its own figure, which
# is closed afterwards, so that it cannot end up on top of a plot that an
# earlier plt.show() left open (for example under a non-interactive backend).
fig_25, ax_25 = plt.subplots()
ax_25.scatter(data['school'], data['lwage'])
fig_25.savefig(figure_dir + "question_2_5.png")
plt.show()
plt.close(fig_25)
|
|
# -----------------------------------------------------------------------------
# Question 2.6
# -----------------------------------------------------------------------------

print_question('Question 2.6: Adding age')

# explanatory variables for question 2.6
x_vars_26 = data[['chinese', 'indian', 'school', 'age']]

# add a constant
X_26 = sm.add_constant(x_vars_26)

# set-up model
model_26 = sm.OLS(data['lwage'], X_26)

# estimate the model
results_26 = model_26.fit()

# print the OLS output
print(results_26.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_26.tex',
                               results_26.summary2().tables[1])

# estimated coefficients, looked up by regressor name
coef = results_26.summary2().tables[1]['Coef.']

# lwage with the estimated contribution of school removed
# NOTE(review): lwage_26 is not used by the scatter plot below, which shows
# the raw lwage against age - confirm which of the two the plot should use.
lwage_26 = data['lwage'] - coef['school'] * data['school']

# Scatter plot of lwage against age. It is drawn on its own figure, which is
# closed afterwards, so that it cannot end up on top of a plot that an earlier
# plt.show() left open (for example under a non-interactive backend).
fig_26, ax_26 = plt.subplots()
ax_26.scatter(data['age'], data['lwage'])
fig_26.savefig(figure_dir + "question_2_6.png")
plt.show()
plt.close(fig_26)

# explanatory variables for question 2.6, now including age squared
# (assumes the data set already contains an 'agesq' column - TODO confirm)
x_vars_26b = data[['chinese', 'indian', 'school', 'age', 'agesq']]

# add a constant
X_26b = sm.add_constant(x_vars_26b)

# set-up model
model_26b = sm.OLS(data['lwage'], X_26b)

# estimate the model
results_26b = model_26b.fit()

# print the OLS output
print(results_26b.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_26b.tex',
                               results_26b.summary2().tables[1])
|
|
|
|
# -----------------------------------------------------------------------------
# Question 2.7
# -----------------------------------------------------------------------------

print_question('Question 2.7: Create the woman variable')

# women is defined as one minus the men column
men_column = data['men']
data['women'] = 1 - men_column
|
|
|
|
# -----------------------------------------------------------------------------
# Question 2.8
# -----------------------------------------------------------------------------

print_question('Question 2.8: lwage model')

# regressors for question 2.8, with a constant column added
x_vars_28 = data.loc[:, ['age', 'school', 'chinese', 'indian', 'men']]
X_28 = sm.add_constant(x_vars_28)

# specify the regression of lwage on these regressors and estimate it by OLS
model_28 = sm.OLS(endog=data['lwage'], exog=X_28)
results_28 = model_28.fit()

# print the OLS output
print(results_28.summary())

# export the coefficients part of the summary to a table
coef_table_28 = results_28.summary2().tables[1]
data_frame_to_latex_table_file(report_dir + 'results_28.tex', coef_table_28)
|
|
|
|
|
|
# =============================================================================
# Question 2.9 (only the heading is printed so far)
# =============================================================================

print_question("Question 2.9: Test ethnicity")

# =============================================================================
# Question 2.10 (only the heading is printed so far)
# =============================================================================

print_question("Question 2.10: Estimate models separately")

# =============================================================================
# Question 2.11 (only the heading is printed so far)
# =============================================================================

print_question("Question 2.11: Predict lwage")

# =============================================================================
# Question 2.12 (only the heading is printed so far)
# =============================================================================

print_question("Question 2.12: Estimate model with squared terms")

# =============================================================================
# Question 2.13 (only the heading is printed so far)
# =============================================================================

print_question("Question 2.13: Squared terms")

# =============================================================================
# Question 2.14 (only the heading is printed so far)
# =============================================================================

print_question("Question 2.14: Add interaction terms")

# =============================================================================
# Question 2.15 (only the heading is printed so far)
# =============================================================================

print_question("Question 2.15: Find your favourite model")
|
|
|