#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Empirical assignment 2024 - 2025

STARTER FILE
"""

import os

import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import pandas as pd

from helper import print_question, data_frame_to_latex_table_file

# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Start of Script for Empirical assignment Econometrics
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Set the folders for the input data and for the output of graphs and tables
# -----------------------------------------------------------------------------

# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.

# for the data
data_dir = '../data/'
os.makedirs(data_dir, exist_ok=True)

# for the figures
figure_dir = '../figures/'
os.makedirs(figure_dir, exist_ok=True)

# for the latex document
report_dir = '../report/'
os.makedirs(report_dir, exist_ok=True)

# -----------------------------------------------------------------------------
# Here we set the seed for our group: the product of the two birthday numbers
# -----------------------------------------------------------------------------

# first birthday
bd_1 = 303
# second birthday
bd_2 = 309

group_seed = bd_1 * bd_2

# set the seed of the legacy global NumPy random state
np.random.seed(group_seed)

# -----------------------------------------------------------------------------
# set the random number generator and seed
# -----------------------------------------------------------------------------

# dedicated generator, seeded with the same group seed, for reproducible
# results (it is independent of the global state seeded above)
rng = np.random.default_rng(group_seed)

# setting for output printing
print_line_length = 90
print_line_start = 5

# number of x points
num_points = 60
# -----------------------------------------------------------------------------
# Load data 2.1
# -----------------------------------------------------------------------------
data_full = pd.read_stata(data_dir + 'assignment2025.dta')

# size of the group-specific subsample: 75% of the rows, rounded down
num_obs = int(0.75 * data_full.shape[0])

# select 75% observations randomly ( the rng uses your seed )
observations = rng.choice(len(data_full), num_obs, replace=False)

# select the observations for your group
# (switch to `data = data_full` to work with the complete sample instead)
data = data_full.iloc[observations, :].copy()

# -----------------------------------------------------------------------------
# Descriptive statistics 2.1
# -----------------------------------------------------------------------------
print_question('Question 2.1: Descriptive Statistics')

# remove the columns that are not used in this script
data = data.drop(
    columns=['fail', 'urban', 'unearn', 'househ', 'amtland', 'unearnx']
)

# keep only the rows with paidwork == 1; the explicit copy makes `data` an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning
data = data.loc[data['paidwork'] == 1].copy()

# school is the sum of yprim and ysec; wage is exp(lwage)
data['school'] = data['yprim'] + data['ysec']
data['wage'] = np.exp(data['lwage'])

# compute the summary statistics
data_summary = data.describe()
new_names = ['count', 'mean', 'std', 'min', '25pct', '50pct', '75pct', 'max']
data_summary.index = new_names

# print to screen
print(data_summary.T)

# export the summary statistics to a file
data_frame_to_latex_table_file(report_dir + 'summary_stats.tex',
                               data_summary.T)

# -----------------------------------------------------------------------------
# Question 2.2
# -----------------------------------------------------------------------------
print_question('Question 2.2: Plot histogram wage / lwage')

# Every plot gets its own figure and is closed after saving. On a
# non-interactive backend plt.show() does not clear the current figure, so
# drawing on the implicit figure would stack later plots on earlier ones.
fig_wage, ax_wage = plt.subplots()
ax_wage.hist(data['wage'], bins=21)
fig_wage.savefig(figure_dir + "question_2_2_wage.png")
plt.show()
plt.close(fig_wage)

fig_lwage, ax_lwage = plt.subplots()
ax_lwage.hist(data['lwage'], bins=21)
fig_lwage.savefig(figure_dir + "question_2_2_lwage.png")
plt.show()
plt.close(fig_lwage)

# -----------------------------------------------------------------------------
# Question 2.3
# -----------------------------------------------------------------------------
print_question('Question 2.3: Sample correlations')

df = data[['wage', 'age', 'school', 'men', 'malay', 'chinese', 'indian']]
corr = df.corr()

# export the correlation matrix to a file
data_frame_to_latex_table_file(report_dir + 'table_2_3.tex', corr)

# -----------------------------------------------------------------------------
# Question 2.4
# -----------------------------------------------------------------------------
print_question('Question 2.4: Estimate lwage model')

# explanatory variables for question 2.4
x_vars_24 = data[['chinese', 'indian']]

# add a constant
X_24 = sm.add_constant(x_vars_24)

# set-up model
model_24 = sm.OLS(data['lwage'], X_24)

# estimate the model
results_24 = model_24.fit()

# print the OLS output
print(results_24.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_24.tex',
                               results_24.summary2().tables[1])

# -----------------------------------------------------------------------------
# Question 2.5
# -----------------------------------------------------------------------------
print_question('Question 2.5: Adding school')

# explanatory variables for question 2.5
x_vars_25 = data[['chinese', 'indian', 'school']]

# add a constant
X_25 = sm.add_constant(x_vars_25)

# set-up model
model_25 = sm.OLS(data['lwage'], X_25)

# estimate the model
results_25 = model_25.fit()

# print the OLS output
print(results_25.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_25.tex',
                               results_25.summary2().tables[1])

# scatter plot of lwage against school
fig_25, ax_25 = plt.subplots()
ax_25.scatter(data['school'], data['lwage'])
fig_25.savefig(figure_dir + "question_2_5.png")
plt.show()
plt.close(fig_25)

# -----------------------------------------------------------------------------
# Question 2.6
# -----------------------------------------------------------------------------
print_question('Question 2.6: Adding age')

# explanatory variables for question 2.6
x_vars_26 = data[['chinese', 'indian', 'school', 'age']]

# add a constant
X_26 = sm.add_constant(x_vars_26)

# set-up model
model_26 = sm.OLS(data['lwage'], X_26)

# estimate the model
results_26 = model_26.fit()

# print the OLS output
print(results_26.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_26.tex',
                               results_26.summary2().tables[1])

# estimated coefficients, indexed by regressor name
coef = results_26.summary2().tables[1]['Coef.']

# lwage net of the estimated school effect. An earlier variant that also
# subtracted the chinese and indian effects was overwritten straight away by
# this assignment and has been removed as dead code.
# NOTE(review): lwage_26 is not used by the scatter below, which plots the raw
# lwage against age - confirm which series the figure is meant to show.
lwage_26 = data['lwage'] - coef['school'] * data['school']

# scatter plot of lwage against age
fig_26, ax_26 = plt.subplots()
ax_26.scatter(data['age'], data['lwage'])
fig_26.savefig(figure_dir + "question_2_6.png")
plt.show()
plt.close(fig_26)

# explanatory variables for question 2.6, extended with agesq
x_vars_26b = data[['chinese', 'indian', 'school', 'age', 'agesq']]

# add a constant
X_26b = sm.add_constant(x_vars_26b)

# set-up model
model_26b = sm.OLS(data['lwage'], X_26b)

# estimate the model
results_26b = model_26b.fit()

# print the OLS output
print(results_26b.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_26b.tex',
                               results_26b.summary2().tables[1])

# -----------------------------------------------------------------------------
# Question 2.7
# -----------------------------------------------------------------------------
print_question('Question 2.7: Create the woman variable')

# women is defined as 1 - men
data['women'] = 1 - data['men']

# -----------------------------------------------------------------------------
# Question 2.8
# -----------------------------------------------------------------------------
print_question('Question 2.8: lwage model')

# explanatory variables for question 2.8
x_vars_28 = data[['age', 'school', 'chinese', 'indian', 'men']]

# add a constant
X_28 = sm.add_constant(x_vars_28)

# set-up model
model_28 = sm.OLS(data['lwage'], X_28)

# estimate the model
results_28 = model_28.fit()

# print the OLS output
print(results_28.summary())

# export the coefficients part of the summary to a table
data_frame_to_latex_table_file(report_dir + 'results_28.tex',
                               results_28.summary2().tables[1])
# -----------------------------------------------------------------------------
# Question 2.9 -- only the heading is printed so far
# -----------------------------------------------------------------------------
print_question("Question 2.9: Test ethnicity")

# -----------------------------------------------------------------------------
# Question 2.10 -- only the heading is printed so far
# -----------------------------------------------------------------------------
print_question("Question 2.10: Estimate models separately")

# -----------------------------------------------------------------------------
# Question 2.11 -- only the heading is printed so far
# -----------------------------------------------------------------------------
print_question("Question 2.11: Predict lwage")

# -----------------------------------------------------------------------------
# Question 2.12 -- only the heading is printed so far
# -----------------------------------------------------------------------------
print_question("Question 2.12: Estimate model with squared terms")

# -----------------------------------------------------------------------------
# Question 2.13 -- only the heading is printed so far
# -----------------------------------------------------------------------------
print_question("Question 2.13: Squared terms")

# -----------------------------------------------------------------------------
# Question 2.14 -- only the heading is printed so far
# -----------------------------------------------------------------------------
print_question("Question 2.14: Add interaction terms")

# -----------------------------------------------------------------------------
# Question 2.15 -- only the heading is printed so far
# -----------------------------------------------------------------------------
print_question("Question 2.15: Find your favourite model")