applied-econometrics-2024/scripts/empirical.py

229 lines
7.4 KiB
Python
Raw Normal View History

2024-12-30 00:35:42 +01:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Empirical assignment 2024 - 2025
STARTER FILE
"""
import os
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import pandas as pd
from helper import print_question, data_frame_to_latex_table_file
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Start of Script for Empirical assignment Econometrics
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Set the folders for output of graphs and tables
# -----------------------------------------------------------------------------
# Input/output folders, given relative to the current working directory.
# Each folder is created when missing; exist_ok=True turns the call into a
# no-op when it already exists, so no separate os.path.exists check is needed.
# for the data
data_dir = '../data/'
os.makedirs(data_dir, exist_ok=True)
# for the figures
figure_dir = '../figures/'
os.makedirs(figure_dir, exist_ok=True)
# for the latex document
report_dir = '../report/'
os.makedirs(report_dir, exist_ok=True)
# -----------------------------------------------------------------------------
# Set the seed for your group (the product of the two birthday numbers below)
# -----------------------------------------------------------------------------
# The group seed is the product of the two birthday numbers below.
# first birthday
bd_1 = 303
# second birthday
bd_2 = 309
group_seed = bd_1 * bd_2
# seed NumPy's legacy global generator, so that code calling np.random.*
# directly is reproducible as well
np.random.seed(group_seed)
# -----------------------------------------------------------------------------
# set the random number generator and seed
# -----------------------------------------------------------------------------
# dedicated Generator seeded with the group seed, for reproducible results
rng = np.random.default_rng(group_seed)
# settings for output printing
print_line_length = 90
print_line_start = 5
# number of x points
num_points = 60
# -----------------------------------------------------------------------------
# Load data 2.1
# -----------------------------------------------------------------------------
# read the complete data set from the data folder
data_full = pd.read_stata(data_dir + 'assignment2025.dta')
# every group works with 75% of the rows (int() truncates the product)
num_obs = int(0.75 * len(data_full))
# draw the row positions at random, without replacement (rng uses your seed)
observations = rng.choice(data_full.shape[0], size=num_obs, replace=False)
# select the observations for your group; the copy keeps data_full untouched
# (to work with the full sample instead, use: data = data_full)
data = data_full.iloc[observations].copy()
# -----------------------------------------------------------------------------
# Descriptive statistics 2.1
# -----------------------------------------------------------------------------
print_question('Question 2.1: Descriptive Statistics')
# drop the variables listed here from the working data set
data = data.drop(
    columns=['fail', 'urban', 'unearn', 'househ', 'amtland', 'unearnx'])
# keep only the observations with paidwork == 1; the explicit copy makes the
# filtered frame independent, so adding columns below does not raise pandas'
# SettingWithCopyWarning
data = data[data['paidwork'] == 1].copy()
# school = yprim + ysec (presumably years of primary plus secondary
# schooling -- confirm against the data description)
data['school'] = data['yprim'] + data['ysec']
# wage in levels, recovered from the log wage
data['wage'] = np.exp(data['lwage'])
# compute the summary statistics
data_summary = data.describe()
# rename the percentile rows by label rather than overwriting the whole index
# by position (presumably the '%' is avoided because of the LaTeX export)
data_summary = data_summary.rename(
    index={'25%': '25pct', '50%': '50pct', '75%': '75pct'})
# print to screen
print(data_summary.T)
# export the summary statistics to a file
data_frame_to_latex_table_file(report_dir + 'summary_stats.tex',
                               data_summary.T)
# -----------------------------------------------------------------------------
# Question 2.2
# -----------------------------------------------------------------------------
print_question('Question 2.2: Plot histogram wage / lwage')
# Draw every histogram on its own figure and close it afterwards. Two bare
# plt.hist calls share the implicit current figure, so whenever plt.show()
# does not close that figure the lwage file would also contain the wage bars.
for column in ('wage', 'lwage'):
    fig, ax = plt.subplots()
    ax.hist(data[column], bins=21)
    ax.set_xlabel(column)
    ax.set_ylabel('frequency')
    # writes question_2_2_wage.png and question_2_2_lwage.png
    fig.savefig(figure_dir + "question_2_2_" + column + ".png")
    plt.show()
    plt.close(fig)
# -----------------------------------------------------------------------------
# Question 2.3
# -----------------------------------------------------------------------------
print_question('Question 2.3: Sample correlations')
# variables whose pairwise sample correlations are reported
df = data[['wage', 'age', 'school', 'men', 'malay', 'chinese', 'indian']]
# correlation matrix (DataFrame.corr uses Pearson correlation by default)
corr = df.corr()
# print to screen, like the summary statistics of question 2.1
print(corr)
# export the correlation matrix to a file
data_frame_to_latex_table_file(report_dir + 'table_2_3.tex',
                               corr)
# -----------------------------------------------------------------------------
# Question 2.4
# -----------------------------------------------------------------------------
print_question('Question 2.4: Estimate lwage model')
# NOTE: everything below is a commented-out template. Lines marked TODO still
# have to be completed; lines marked [uncomment] only need to be enabled once
# the TODO lines are filled in.
# explanatory variables for question 2.4
# NOTE(review): 'smcity' and 'AA' look like starter-file placeholders --
# replace them with the regressors the question asks for.
# x_vars_24 = data[['smcity', 'AA']] # TODO
# add a constant
# X_24 = sm.add_constant(x_vars_24) [uncomment]
# set-up model (dependent variable first, then the regressors)
# model_24 = sm.OLS(,) #TODO
# estimate the model
# results_24 = model_24. #TODO
# print the OLS output
# print(results_24.summary()) [uncomment]
# export the coefficients part of the summary to a table
# data_frame_to_latex_table_file(report_dir + 'results_24.tex',
# results_24.summary2().tables[1])
# -----------------------------------------------------------------------------
# Question 2.5
# -----------------------------------------------------------------------------
# NOTE: from here to question 2.15 each section only prints its header via
# print_question; no analysis code follows the headers in the visible source.
print_question('Question 2.5: Adding school')
# -----------------------------------------------------------------------------
# Question 2.6
# -----------------------------------------------------------------------------
print_question('Question 2.6: Adding age')
# -----------------------------------------------------------------------------
# Question 2.7
# -----------------------------------------------------------------------------
print_question('Question 2.7: Create the woman variable')
# -----------------------------------------------------------------------------
# Question 2.8
# -----------------------------------------------------------------------------
print_question('Question 2.8: lwage model')
# -----------------------------------------------------------------------------
# Question 2.9
# -----------------------------------------------------------------------------
print_question('Question 2.9: Test ethnicity')
# -----------------------------------------------------------------------------
# Question 2.10
# -----------------------------------------------------------------------------
print_question('Question 2.10: Estimate models separately')
# -----------------------------------------------------------------------------
# Question 2.11
# -----------------------------------------------------------------------------
print_question('Question 2.11: Predict lwage')
# -----------------------------------------------------------------------------
# Question 2.12
# -----------------------------------------------------------------------------
print_question('Question 2.12: Estimate model with squared terms')
# -----------------------------------------------------------------------------
# Question 2.13
# -----------------------------------------------------------------------------
print_question('Question 2.13: Squared terms')
# -----------------------------------------------------------------------------
# Question 2.14
# -----------------------------------------------------------------------------
print_question('Question 2.14: Add interaction terms')
# -----------------------------------------------------------------------------
# Question 2.15
# -----------------------------------------------------------------------------
print_question('Question 2.15: Find your favourite model')