diff --git a/figures/question_2_5.png b/figures/question_2_5.png new file mode 100644 index 0000000..ec089ae Binary files /dev/null and b/figures/question_2_5.png differ diff --git a/figures/question_2_6.png b/figures/question_2_6.png new file mode 100644 index 0000000..e00f391 Binary files /dev/null and b/figures/question_2_6.png differ diff --git a/report/Assignment.tex b/report/Assignment.tex index 75ffabc..dbf2b22 100644 --- a/report/Assignment.tex +++ b/report/Assignment.tex @@ -257,44 +257,6 @@ Because $Var(x_1)$ changed from 36 in to 1, we expect the standard error to be $ If the standard deviation from $x_1$ changes to 0, $\beta_1$ cannot we calculated. As we have seen with the no multicollinearity assumption. -\section{examples} -Some greek letters: - -$\alpha$ -$\beta$ -$\gamma$ -$\theta$ -$\varepsilon$ -$\pi$ -$\lambda$ -$\tau$ - -$x=x+27$ -x=x+27 - - - -$A \Longrightarrow B$ - - -$\underbrace{abs}_{test}$ - -sub and superscript - -$\beta_0$ -$\sum_{i=1}^{n} i$ - -In an equation: - -\begin{equation} -\sum_{j=1}{n} j^2 \beta -\end{equation} - -Equation without number - -\begin{equation*} -A \Rightarrow B -\end{equation*} \section{Empirical Investigation} @@ -315,13 +277,13 @@ We retain 2510 observations. \subsection{Question 2.2} -\begin{figure} +\begin{figure} [ht] \includegraphics[width=0.6\paperwidth]{../figures/question_2_2_wage} \caption{Histogram wage} \label{fig::question_2_2_wage} \end{figure} -\begin{figure} +\begin{figure} [ht] \includegraphics[width=0.6\paperwidth]{../figures/question_2_2_lwage} \caption{Histogram lwage} \label{fig::question_2_2_lwage} @@ -331,6 +293,8 @@ The lwage histogram in fig \ref{fig::question_2_2_lwage} is nicely centered so t \subsection{Question 2.3} +We are going to investigate the correlation between the variables wage, age, school, men, malay, chinese and indian. 
+ \begin{table}[ht] \centering \input{table_2_3} @@ -338,10 +302,86 @@ The lwage histogram in fig \ref{fig::question_2_2_lwage} is nicely centered so t \label{tab::table_2_3} \end{table} -We can see that there is a positive correlation between wage and school. It means that people who go longer to school will get a higher wage. There is a negative correlation between age and school. The younger generation is higher educated than older generation. Chinese citizens are better payed than malay, indian citizens have a negative correlation with wage. +We can see in table \ref{tab::table_2_3} that there is a positive correlation between wage and school. It means that people who go to school longer tend to earn a higher wage. There is a negative correlation between age and school: the younger generation is more highly educated than the older generation. Chinese citizens are better paid than Malay citizens, while being Indian is negatively correlated with wage. \subsection{Question 2.4} +We estimate a regression for lwage using the variables chinese and indian. Malay is the omitted reference category, so its level is captured by the constant. +\begin{table}[ht] +\centering +\input{results_24} +\caption{Linear model lwage} +\label{tab::results_24} +\end{table} + + $R^{2} = 0.0255$, which means that ethnicity alone explains only about 2.6\% of the variation in lwage. If we look at the coefficients in table \ref{tab::results_24}, we see a value of $-0.17$ for indian and a value of $0.14$ for chinese, both measured relative to malay, whose mean lwage is the constant $0.79$. +These results imply that there is a wage gap based on ethnicity. + +\subsection{Question 2.5} +We estimate a regression for lwage using the variables chinese, indian and school. + +\begin{table}[ht] +\centering +\input{results_25} +\caption{Linear model lwage/school} +\label{tab::results_25} +\end{table} + + $R^{2} = 0.224$, which means that the model explains about 22\% of the variation in lwage. The coefficients are shown in table \ref{tab::results_25}. 
We see a value of $-0.07$ for indian and a value of $0.19$ for chinese. Both are differences relative to malay, the omitted reference category, at the same number of years of schooling. + +To check whether the relationship between lwage and years of schooling is non-linear, we plot it: + +\begin{figure} [ht] +\includegraphics[width=0.6\paperwidth]{../figures/question_2_5} +\caption{lwage vs school} +\label{fig::question_2_5} +\end{figure} + +In figure \ref{fig::question_2_5} there is no obvious non-linearity. + +\subsection{Question 2.6} +We estimate a regression for lwage using the variables chinese, indian, school and age. + +\begin{table}[ht] +\centering +\input{results_26} +\caption{Linear model lwage/age} +\label{tab::results_26} +\end{table} + + $R^{2} = 0.370$, which means that the model explains about 37\% of the variation in lwage. The coefficients are shown in table \ref{tab::results_26}. + +To check whether the relationship between lwage and age is non-linear, we plot it: + +\begin{figure} [ht] +\includegraphics[width=0.6\paperwidth]{../figures/question_2_6} +\caption{lwage vs age} +\label{fig::question_2_6} +\end{figure} + +In figure \ref{fig::question_2_6} there is a banana-shaped pattern, indicating a non-linear relationship with peak earnings around age 40. +We can use \textbf{agesq} to do a parabolic fit. If we run this model we get: + +\begin{table}[ht] +\centering +\input{results_26b} +\caption{parabolic model lwage/age} +\label{tab::results_26b} +\end{table} + +We find $R^{2} = 0.429$, which is higher than without agesq. +In table \ref{tab::results_26b} we see a negative coefficient for agesq, which explains the parabolic shape with a maximum. + +\subsection{Question 2.8} + +\begin{table}[ht] +\centering +\input{results_28} +\caption{Linear model 2.8} +\label{tab::results_28} +\end{table} + +From table \ref{tab::results_28} we can conclude that the coefficient for age does not differ substantially between the tables. For school we see a small difference. 
\end{document} diff --git a/report/results_24.tex b/report/results_24.tex new file mode 100644 index 0000000..42ef854 --- /dev/null +++ b/report/results_24.tex @@ -0,0 +1,9 @@ +\begin{tabular}{lrrrrrr} +\toprule + & Coef. & Std.Err. & t & P>|t| & [0.025 & 0.975] \\ +\midrule +const & 0.793313 & 0.022022 & 36.023272 & 0.000000 & 0.750129 & 0.836496 \\ +chinese & 0.138678 & 0.035496 & 3.906889 & 0.000096 & 0.069074 & 0.208282 \\ +indian & -0.173653 & 0.034758 & -4.996017 & 0.000001 & -0.241812 & -0.105495 \\ +\bottomrule +\end{tabular} diff --git a/report/results_25.tex b/report/results_25.tex new file mode 100644 index 0000000..8bebbdc --- /dev/null +++ b/report/results_25.tex @@ -0,0 +1,10 @@ +\begin{tabular}{lrrrrrr} +\toprule + & Coef. & Std.Err. & t & P>|t| & [0.025 & 0.975] \\ +\midrule +const & 0.039704 & 0.035695 & 1.112307 & 0.266113 & -0.030291 & 0.109699 \\ +chinese & 0.189360 & 0.031751 & 5.963947 & 0.000000 & 0.127099 & 0.251620 \\ +indian & -0.066426 & 0.031317 & -2.121045 & 0.034016 & -0.127836 & -0.005015 \\ +school & 0.087562 & 0.003462 & 25.294557 & 0.000000 & 0.080774 & 0.094350 \\ +\bottomrule +\end{tabular} diff --git a/report/results_26.tex b/report/results_26.tex new file mode 100644 index 0000000..5493a73 --- /dev/null +++ b/report/results_26.tex @@ -0,0 +1,11 @@ +\begin{tabular}{lrrrrrr} +\toprule + & Coef. & Std.Err. 
& t & P>|t| & [0.025 & 0.975] \\ +\midrule +const & -1.128876 & 0.058122 & -19.422515 & 0.000000 & -1.242848 & -1.014904 \\ +chinese & 0.203956 & 0.028611 & 7.128603 & 0.000000 & 0.147853 & 0.260060 \\ +indian & -0.010308 & 0.028310 & -0.364116 & 0.715802 & -0.065821 & 0.045205 \\ +school & 0.114892 & 0.003318 & 34.628243 & 0.000000 & 0.108386 & 0.121398 \\ +age & 0.028072 & 0.001163 & 24.136639 & 0.000000 & 0.025791 & 0.030353 \\ +\bottomrule +\end{tabular} diff --git a/report/results_26b.tex b/report/results_26b.tex new file mode 100644 index 0000000..47a8891 --- /dev/null +++ b/report/results_26b.tex @@ -0,0 +1,12 @@ +\begin{tabular}{lrrrrrr} +\toprule + & Coef. & Std.Err. & t & P>|t| & [0.025 & 0.975] \\ +\midrule +const & -2.689961 & 0.111653 & -24.092167 & 0.000000 & -2.908903 & -2.471020 \\ +chinese & 0.216072 & 0.027252 & 7.928647 & 0.000000 & 0.162633 & 0.269511 \\ +indian & -0.004273 & 0.026958 & -0.158500 & 0.874076 & -0.057134 & 0.048589 \\ +school & 0.106121 & 0.003206 & 33.103889 & 0.000000 & 0.099835 & 0.112408 \\ +age & 0.126927 & 0.006240 & 20.341402 & 0.000000 & 0.114691 & 0.139163 \\ +agesq & -0.135899 & 0.008442 & -16.098099 & 0.000000 & -0.152453 & -0.119345 \\ +\bottomrule +\end{tabular} diff --git a/report/results_28.tex b/report/results_28.tex new file mode 100644 index 0000000..e6d3678 --- /dev/null +++ b/report/results_28.tex @@ -0,0 +1,12 @@ +\begin{tabular}{lrrrrrr} +\toprule + & Coef. & Std.Err. 
& t & P>|t| & [0.025 & 0.975] \\ +\midrule +const & -1.205128 & 0.057278 & -21.039804 & 0.000000 & -1.317446 & -1.092810 \\ +age & 0.025987 & 0.001154 & 22.523092 & 0.000000 & 0.023724 & 0.028249 \\ +school & 0.111415 & 0.003261 & 34.169989 & 0.000000 & 0.105021 & 0.117809 \\ +chinese & 0.221610 & 0.028027 & 7.907069 & 0.000000 & 0.166652 & 0.276568 \\ +indian & 0.013045 & 0.027769 & 0.469753 & 0.638572 & -0.041408 & 0.067498 \\ +men & 0.258888 & 0.024088 & 10.747734 & 0.000000 & 0.211654 & 0.306122 \\ +\bottomrule +\end{tabular} diff --git a/report/table_2_3.tex b/report/table_2_3.tex index faf56f3..4b53ab3 100644 --- a/report/table_2_3.tex +++ b/report/table_2_3.tex @@ -1,20 +1,13 @@ -\begin{tabular}{lrrrrrrrr} +\begin{tabular}{lrrrrrrr} \toprule - & count & mean & std & min & 25\% & 50\% & 75\% & max \\ + & wage & age & school & men & malay & chinese & indian \\ \midrule -paidwork & 2510.000000 & 1.000000 & 0.000000 & 1.000000 & 1.000000 & 1.000000 & 1.000000 & 1.000000 \\ -lwage & 2510.000000 & 0.780391 & 0.737255 & -3.336058 & 0.299333 & 0.766255 & 1.241741 & 4.208274 \\ -men & 2510.000000 & 0.624303 & 0.484398 & 0.000000 & 0.000000 & 1.000000 & 1.000000 & 1.000000 \\ -malay & 2510.000000 & 0.435458 & 0.495919 & 0.000000 & 0.000000 & 0.000000 & 1.000000 & 1.000000 \\ -chinese & 2510.000000 & 0.272510 & 0.445334 & 0.000000 & 0.000000 & 0.000000 & 1.000000 & 1.000000 \\ -indian & 2510.000000 & 0.292032 & 0.454793 & 0.000000 & 0.000000 & 0.000000 & 1.000000 & 1.000000 \\ -age & 2510.000000 & 33.025101 & 10.699703 & 15.000000 & 25.000000 & 31.000000 & 39.000000 & 65.000000 \\ -agesq & 2510.000000 & 12.050953 & 7.977792 & 2.250000 & 6.250000 & 9.610000 & 15.210000 & 42.250000 \\ -gexpr & 2510.000000 & 18.933865 & 12.482897 & 0.000000 & 9.000000 & 16.000000 & 26.000000 & 59.000000 \\ -gexprsq & 2510.000000 & 5.142518 & 6.277625 & 0.000000 & 0.810000 & 2.560000 & 6.760000 & 34.810001 \\ -yprim & 2510.000000 & 5.277291 & 1.711691 & 0.000000 & 6.000000 & 6.000000 & 
6.000000 & 6.000000 \\ -ysec & 2510.000000 & 2.813944 & 2.704680 & 0.000000 & 0.000000 & 3.000000 & 5.000000 & 14.000000 \\ -school & 2510.000000 & 8.091235 & 3.783405 & 0.000000 & 6.000000 & 9.000000 & 11.000000 & 20.000000 \\ -wage & 2510.000000 & 2.903078 & 2.886990 & 0.035577 & 1.348958 & 2.151692 & 3.461635 & 67.240379 \\ +wage & 1.000000 & 0.206329 & 0.406310 & 0.172137 & 0.014109 & 0.112021 & -0.125078 \\ +age & 0.206329 & 1.000000 & -0.333795 & 0.144930 & 0.018370 & 0.015293 & -0.035007 \\ +school & 0.406310 & -0.333795 & 1.000000 & 0.056334 & 0.119656 & -0.010267 & -0.120423 \\ +men & 0.172137 & 0.144930 & 0.056334 & 1.000000 & 0.097288 & -0.027757 & -0.078907 \\ +malay & 0.014109 & 0.018370 & 0.119656 & 0.097288 & 1.000000 & -0.537530 & -0.564071 \\ +chinese & 0.112021 & 0.015293 & -0.010267 & -0.027757 & -0.537530 & 1.000000 & -0.393085 \\ +indian & -0.125078 & -0.035007 & -0.120423 & -0.078907 & -0.564071 & -0.393085 & 1.000000 \\ \bottomrule \end{tabular} diff --git a/scripts/empirical.py b/scripts/empirical.py index 0e91d70..0e3f53f 100644 --- a/scripts/empirical.py +++ b/scripts/empirical.py @@ -140,48 +140,136 @@ data_frame_to_latex_table_file(report_dir + 'table_2_3.tex', print_question('Question 2.4: Estimate lwage model') # explanatory variables for question 2.4 -# x_vars_24 = data[['smcity', 'AA']] # TODO +x_vars_24 = data[['chinese', 'indian']] # add a constant -# X_24 = sm.add_constant(x_vars_24) [uncomment] +X_24 = sm.add_constant(x_vars_24) # set-up model -# model_24 = sm.OLS(,) #TODO +model_24 = sm.OLS(data['lwage'], X_24) # estimate the model -# results_24 = model_24. 
#TODO +results_24 = model_24.fit() # print the OLS output -# print(results_24.summary()) [uncomment] +print(results_24.summary()) # export the coefficients part of the summary to a table -# data_frame_to_latex_table_file(report_dir + 'results_24.tex', -# results_24.summary2().tables[1]) +data_frame_to_latex_table_file(report_dir + 'results_24.tex', + results_24.summary2().tables[1]) # ----------------------------------------------------------------------------- # Question 2.5 # ----------------------------------------------------------------------------- print_question('Question 2.5: Adding school') +# explanatory variables for question 2.5 +x_vars_25 = data[['chinese', 'indian', 'school']] +# add a constant +X_25 = sm.add_constant(x_vars_25) + +# set-up model +model_25 = sm.OLS(data['lwage'], X_25) + +# estimate the model +results_25 = model_25.fit() + +# print the OLS output +print(results_25.summary()) + +# export the coefficients part of the summary to a table +data_frame_to_latex_table_file(report_dir + 'results_25.tex', + results_25.summary2().tables[1]) + +plt.scatter(data['school'], data['lwage']) +plt.savefig(figure_dir + "question_2_5.png") +plt.show() # ----------------------------------------------------------------------------- # Question 2.6 # ----------------------------------------------------------------------------- print_question('Question 2.6: Adding age') +# explanatory variables for question 2.6 +x_vars_26 = data[['chinese', 'indian', 'school', 'age']] + +# add a constant +X_26 = sm.add_constant(x_vars_26) + +# set-up model +model_26 = sm.OLS(data['lwage'], X_26) + +# estimate the model +results_26 = model_26.fit() + +# print the OLS output +print(results_26.summary()) + +# export the coefficients part of the summary to a table +data_frame_to_latex_table_file(report_dir + 'results_26.tex', + results_26.summary2().tables[1]) +coef = results_26.summary2().tables[1]['Coef.'] +lwage_26 = data['lwage'] - coef['chinese']*data['chinese'] - 
coef['indian']*data['indian'] - coef['school']*data['school'] +lwage_26 = data['lwage'] - coef['school']*data['school'] + +plt.scatter(data['age'], data['lwage']) +plt.savefig(figure_dir + "question_2_6.png") +plt.show() + +# explanatory variables for question 2.6 (with agesq) +x_vars_26b = data[['chinese', 'indian', 'school', 'age', 'agesq']] + +# add a constant +X_26b = sm.add_constant(x_vars_26b) + +# set-up model +model_26b = sm.OLS(data['lwage'], X_26b) + +# estimate the model +results_26b = model_26b.fit() + +# print the OLS output +print(results_26b.summary()) + +# export the coefficients part of the summary to a table +data_frame_to_latex_table_file(report_dir + 'results_26b.tex', + results_26b.summary2().tables[1]) + # ----------------------------------------------------------------------------- # Question 2.7 # ----------------------------------------------------------------------------- print_question('Question 2.7: Create the woman variable') +data['women'] = 1 - data['men'] + # ----------------------------------------------------------------------------- # Question 2.8 # ----------------------------------------------------------------------------- print_question('Question 2.8: lwage model') +# explanatory variables for question 2.8 +x_vars_28 = data[['age', 'school', 'chinese', 'indian', 'men']] + +# add a constant +X_28 = sm.add_constant(x_vars_28) + +# set-up model +model_28 = sm.OLS(data['lwage'], X_28) + +# estimate the model +results_28 = model_28.fit() + +# print the OLS output +print(results_28.summary()) + +# export the coefficients part of the summary to a table +data_frame_to_latex_table_file(report_dir + 'results_28.tex', + results_28.summary2().tables[1]) + + # ----------------------------------------------------------------------------- # Question 2.9 # -----------------------------------------------------------------------------