Lab 4 - Julia Code#
Authors: Valerie Dube, Erzo Garay, Juan Marcos Guerrero y Matias Villalba
Bootstraping#
using CSV
using DataFrames
using Random
using GLM
using Statistics
using Plots
# Load the data
Penn = CSV.read("../../data/penn_jae.csv", DataFrame)
Penn = Penn[(Penn.tg .== 4) .| (Penn.tg .== 0), :]
Penn.tg[Penn.tg .== 4] .= 1
rename!(Penn, :tg => :t4)
5099×24 DataFrame
5074 rows omitted
Row | Column1 | abdt | t4 | inuidur1 | inuidur2 | female | black | hispanic | othrace | dep | q1 | q2 | q3 | q4 | q5 | q6 | recall | agelt35 | agegt54 | durable | nondurable | lusd | husd | muld |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | |
1 | 1 | 10824 | 0 | 18 | 18 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 4 | 10824 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 5 | 10747 | 0 | 27 | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 12 | 10607 | 1 | 9 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
5 | 13 | 10831 | 0 | 27 | 27 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
6 | 14 | 10845 | 0 | 27 | 27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
7 | 15 | 10831 | 0 | 9 | 9 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
8 | 17 | 10859 | 0 | 27 | 27 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
9 | 23 | 10516 | 0 | 15 | 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
10 | 25 | 10663 | 0 | 28 | 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
11 | 26 | 10747 | 0 | 12 | 12 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
12 | 27 | 10551 | 1 | 22 | 22 | 1 | 0 | 1 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
13 | 32 | 10768 | 0 | 18 | 18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
5088 | 13893 | 10691 | 0 | 27 | 27 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
5089 | 13894 | 10796 | 0 | 15 | 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
5090 | 13895 | 10635 | 1 | 20 | 20 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
5091 | 13896 | 10859 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
5092 | 13899 | 10796 | 0 | 23 | 23 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5093 | 13900 | 10740 | 1 | 13 | 13 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
5094 | 13901 | 10845 | 0 | 6 | 6 | 1 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5095 | 13905 | 10628 | 1 | 10 | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
5096 | 13906 | 10523 | 1 | 4 | 4 | 0 | 0 | 1 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5097 | 13907 | 10558 | 0 | 9 | 9 | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
5098 | 13911 | 10817 | 1 | 4 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
5099 | 13912 | 10691 | 0 | 27 | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
# Create new columns
Penn.dep1 = (Penn.dep .== 1) .|> Int
Penn.dep2 = (Penn.dep .== 2) .|> Int
Penn.log_inuidur1 = log.(Penn.inuidur1)
select!(Penn, Not(:dep))
5099×26 DataFrame
5074 rows omitted
Row | Column1 | abdt | t4 | inuidur1 | inuidur2 | female | black | hispanic | othrace | q1 | q2 | q3 | q4 | q5 | q6 | recall | agelt35 | agegt54 | durable | nondurable | lusd | husd | muld | dep1 | dep2 | log_inuidur1 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Float64 | |
1 | 1 | 10824 | 0 | 18 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 2.89037 |
2 | 4 | 10824 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0.0 |
3 | 5 | 10747 | 0 | 27 | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3.29584 |
4 | 12 | 10607 | 1 | 9 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2.19722 |
5 | 13 | 10831 | 0 | 27 | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 3.29584 |
6 | 14 | 10845 | 0 | 27 | 27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3.29584 |
7 | 15 | 10831 | 0 | 9 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 2.19722 |
8 | 17 | 10859 | 0 | 27 | 27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3.29584 |
9 | 23 | 10516 | 0 | 15 | 15 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2.70805 |
10 | 25 | 10663 | 0 | 28 | 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3.3322 |
11 | 26 | 10747 | 0 | 12 | 12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2.48491 |
12 | 27 | 10551 | 1 | 22 | 22 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 3.09104 |
13 | 32 | 10768 | 0 | 18 | 18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2.89037 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
5088 | 13893 | 10691 | 0 | 27 | 27 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 3.29584 |
5089 | 13894 | 10796 | 0 | 15 | 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 2.70805 |
5090 | 13895 | 10635 | 1 | 20 | 20 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2.99573 |
5091 | 13896 | 10859 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0.0 |
5092 | 13899 | 10796 | 0 | 23 | 23 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3.13549 |
5093 | 13900 | 10740 | 1 | 13 | 13 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2.56495 |
5094 | 13901 | 10845 | 0 | 6 | 6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1.79176 |
5095 | 13905 | 10628 | 1 | 10 | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2.30259 |
5096 | 13906 | 10523 | 1 | 4 | 4 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1.38629 |
5097 | 13907 | 10558 | 0 | 9 | 9 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 2.19722 |
5098 | 13911 | 10817 | 1 | 4 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.38629 |
5099 | 13912 | 10691 | 0 | 27 | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 3.29584 |
# Function to get estimates
function get_estimates(data, index)
A = select(data, [:t4, :female, :black, :othrace, :dep1, :dep2, :q2, :q3, :q4, :q5, :q6, :agelt35, :agegt54, :durable, :lusd, :husd])
X = A[index, :]
y = data.log_inuidur1[index]
model = lm(@formula(log_inuidur1 ~ t4 + female + black + othrace + dep1 + dep2 + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd), data[index, :])
intercept = coef(model)[1]
coefficients = coef(model)[2:end]
return (intercept, coefficients)
end
# Function to get random indices
function get_indices(data, num_samples)
return rand(1:nrow(data), num_samples)
end
n = nrow(Penn)
# Bootstrap function
function boot(data, func, R)
coeff_1 = []
coeff_2 = []
coeff_3 = []
for i in 1:R
indices = get_indices(data, n)
_, coef = func(data, indices)
push!(coeff_1, coef[1])
push!(coeff_2, coef[2])
push!(coeff_3, coef[3])
end
coeff_1_statistics = Dict("estimated_value" => mean(coeff_1), "std_error" => std(coeff_1))
coeff_2_statistics = Dict("estimated_value" => mean(coeff_2), "std_error" => std(coeff_2))
coeff_3_statistics = Dict("estimated_value" => mean(coeff_3), "std_error" => std(coeff_3))
return Dict("coeff_1_statistics" => coeff_1_statistics, "coeff_2_statistics" => coeff_2_statistics, "coeff_3_statistics" => coeff_3_statistics), coeff_1, coeff_2, coeff_3
end
boot (generic function with 1 method)
results = boot(Penn, get_estimates, 1000)
println("Result for coefficient term t4 ", results[1]["coeff_1_statistics"])
println("Result for coefficient term female", results[1]["coeff_2_statistics"])
println("Result for coefficient term black", results[1]["coeff_3_statistics"])
results[1]
Dict{String, Dict{String, Float64}} with 3 entries:
"coeff_1_statistics" => Dict("estimated_value"=>-0.0704815, "std_error"=>0.03…
"coeff_2_statistics" => Dict("estimated_value"=>0.126044, "std_error"=>0.0350…
"coeff_3_statistics" => Dict("estimated_value"=>-0.294649, "std_error"=>0.062…
data = DataFrame(
Dict(
"Variable" => ["t4", "female", "black"],
"Estimate" => [
results[1]["coeff_1_statistics"]["estimated_value"],
results[1]["coeff_2_statistics"]["estimated_value"],
results[1]["coeff_3_statistics"]["estimated_value"]
],
"Standard Error" => [
results[1]["coeff_1_statistics"]["std_error"],
results[1]["coeff_2_statistics"]["std_error"],
results[1]["coeff_3_statistics"]["std_error"]
]
)
)
println(data)
3×3 DataFrame
Row │ Estimate Standard Error Variable
│ Float64 Float64 String
─────┼──────────────────────────────────────
1 │ -0.0704815 0.035352 t4
2 │ 0.126044 0.0350154 female
3 │ -0.294649 0.0621332 black
# Plotting the histograms
histogram(results[2], bins=20, title="Histogram - t4's coefficient (Density)", label="t4", alpha=0.6, normalize=true)
histogram(results[3], bins=20, title="Histogram - female's coefficient (Density)", label="female", alpha=0.6, normalize=true)
histogram(results[4], bins=20, title="Histogram - black's coefficient (Density)", label="black", alpha=0.6, normalize=true)