applied-econometrics-2024/scripts/empirical.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Empirical assignment 2024 - 2025

STARTER FILE
"""

import os
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import pandas as pd

from helper import print_question, data_frame_to_latex_table_file

# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Start of Script for Empirical assignment Econometrics
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Set the folders for output of graphs and tables
# -----------------------------------------------------------------------------

# Folders used by the script, relative to the current working directory.
# The trailing slash is kept on purpose: file names are appended to these
# strings with '+' further down in the script.

# for the data
data_dir = '../data/'
# for the figures
figure_dir = '../figures/'
# for the latex document
report_dir = '../report/'

# Create every folder that does not exist yet. exist_ok=True replaces the
# separate "exists?" check, so there is no gap between checking and creating
# in which another process could create the folder first. If a regular file
# already sits at one of these paths, makedirs raises FileExistsError.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(figure_dir, exist_ok=True)
os.makedirs(report_dir, exist_ok=True)

# -----------------------------------------------------------------------------
# Set the seed for your group: the product of two birthdays (each written as an integer)
# -----------------------------------------------------------------------------


# the two birthdays, each stored as a single integer
bd_1 = 3112  # first birthday
bd_2 = 3112  # second birthday

# the group seed is the product of both birthdays
group_seed = bd_1 * bd_2

# seed NumPy's legacy global random state with the group seed
np.random.seed(seed=group_seed)

# -----------------------------------------------------------------------------
# set the random number generator and seed
# -----------------------------------------------------------------------------

# random number generator seeded with the group seed, for reproducible results
rng = np.random.default_rng(seed=group_seed)

# settings for output printing
print_line_length, print_line_start = 90, 5

# number of x points
num_points = 60

# -----------------------------------------------------------------------------
# Load data 2.1
# -----------------------------------------------------------------------------

# read the complete data set from the Stata file in the data folder
data_full = pd.read_stata(data_dir + 'assignment2025.dta')

# size of the group sample: 75% of the rows, truncated to a whole number
num_obs = int(0.75 * data_full.shape[0])
# draw that many distinct row positions at random ( the rng uses your seed )
observations = rng.choice(len(data_full), size=num_obs, replace=False)
# keep only the drawn observations for your group, as an independent copy
data = data_full.iloc[observations, :].copy()

#data = data_full

# -----------------------------------------------------------------------------
# Descriptive statistics 2.1
# -----------------------------------------------------------------------------

# announce the question in the output
# (print_question is imported from helper.py; presumably it prints a
#  formatted section title - confirm in helper.py)
print_question('Question 2.1: Descriptive Statistics')

# compute the summary statistics
# TODO: assign the summary statistics of `data` to data_summary; the lines
#       below use data_summary.T, so it is presumably a pandas DataFrame
# data_summary = TODO

# print to screen (transposed)
# print(data_summary.T) [uncomment]

# export the summary statistics to a file in the report folder
# NOTE(review): the file name 'summmary_stats.tex' is spelled with three m's;
#               keep it in sync with the name the LaTeX report expects
# data_frame_to_latex_table_file(report_dir + 'summmary_stats.tex',
#                               data_summary.T) [uncomment]

# -----------------------------------------------------------------------------
# Question 2.2
# -----------------------------------------------------------------------------

# announce the question in the output
print_question('Question 2.2: Plot histogram wage / lwage')

# TODO: add the histogram code for question 2.2 here
#       (figure_dir is the folder set up above for graphs)

# -----------------------------------------------------------------------------
# Question 2.3
# -----------------------------------------------------------------------------

# announce the question in the output
print_question('Question 2.3: Sample correlations')

# TODO: add the sample correlation code for question 2.3 here


# -----------------------------------------------------------------------------
# Question 2.4
# -----------------------------------------------------------------------------

# announce the question in the output
print_question('Question 2.4: Estimate lwage model')

# explanatory variables for question 2.4
# TODO: select the regressor columns from `data`; adjust the list of column
#       names below to the variables the question asks for
# x_vars_24 = data[['smcity', 'AA']] # TODO

# add a constant (intercept column) to the regressors
# X_24 = sm.add_constant(x_vars_24) [uncomment]

# set-up model
# TODO: fill in both arguments of sm.OLS: the dependent variable first,
#       then the regressor matrix (X_24)
# model_24 = sm.OLS(,) #TODO

# estimate the model
# TODO: call the estimation method of the model (for statsmodels OLS: .fit())
# results_24 = model_24. #TODO

# print the OLS output
# print(results_24.summary()) [uncomment]

# export the coefficients part of the summary to a table in the report folder
# [uncomment both lines below once results_24 exists]
# data_frame_to_latex_table_file(report_dir + 'results_24.tex',
#                                results_24.summary2().tables[1])

# -----------------------------------------------------------------------------
# Question 2.5
# -----------------------------------------------------------------------------

# Each section below only announces its question in the output; the code for
# the question itself still has to be added underneath the print_question call.
print_question('Question 2.5: Adding school')

# TODO: add the code for question 2.5 here

# -----------------------------------------------------------------------------
# Question 2.6
# -----------------------------------------------------------------------------

print_question('Question 2.6: Adding age')

# TODO: add the code for question 2.6 here

# -----------------------------------------------------------------------------
# Question 2.7
# -----------------------------------------------------------------------------

print_question('Question 2.7: Create the woman variable')

# TODO: add the code for question 2.7 here

# -----------------------------------------------------------------------------
# Question 2.8
# -----------------------------------------------------------------------------

print_question('Question 2.8: lwage model')

# TODO: add the code for question 2.8 here

# -----------------------------------------------------------------------------
# Question 2.9
# -----------------------------------------------------------------------------

print_question('Question 2.9: Test ethnicity')

# TODO: add the code for question 2.9 here

# -----------------------------------------------------------------------------
# Question 2.10
# -----------------------------------------------------------------------------

print_question('Question 2.10: Estimate models separately')

# TODO: add the code for question 2.10 here

# -----------------------------------------------------------------------------
# Question 2.11
# -----------------------------------------------------------------------------

print_question('Question 2.11: Predict lwage')

# TODO: add the code for question 2.11 here

# -----------------------------------------------------------------------------
# Question 2.12
# -----------------------------------------------------------------------------

print_question('Question 2.12: Estimate model with squared terms')

# TODO: add the code for question 2.12 here

# -----------------------------------------------------------------------------
# Question 2.13
# -----------------------------------------------------------------------------

print_question('Question 2.13: Squared terms')

# TODO: add the code for question 2.13 here


# -----------------------------------------------------------------------------
# Question 2.14
# -----------------------------------------------------------------------------

print_question('Question 2.14: Add interaction terms')

# TODO: add the code for question 2.14 here


# -----------------------------------------------------------------------------
# Question 2.15
# -----------------------------------------------------------------------------

print_question('Question 2.15: Find your favourite model')

# TODO: add the code for question 2.15 here