Lab 4 - Julia Code

Contents

Lab 4 - Julia Code#

Authors: Valerie Dube, Erzo Garay, Juan Marcos Guerrero y Matias Villalba

Bootstraping#

using CSV
using DataFrames
using Random
using GLM
using Statistics
using Plots
# Load the data
Penn = CSV.read("../../data/penn_jae.csv", DataFrame)
Penn = Penn[(Penn.tg .== 4) .| (Penn.tg .== 0), :]
Penn.tg[Penn.tg .== 4] .= 1
rename!(Penn, :tg => :t4)
5099×24 DataFrame
5074 rows omitted
RowColumn1abdtt4inuidur1inuidur2femaleblackhispanicothracedepq1q2q3q4q5q6recallagelt35agegt54durablenondurablelusdhusdmuld
Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64
1110824018180000200001000000010
24108240110000000001000000100
3510747027270000000010000000100
412106071990000000100001000001
51310831027270000100001000110100
61410845027271000000001000100100
715108310991000100001000100100
81710859027271000100000100000100
92310516015151000001000000000100
102510663028111000000100000100100
112610747012121000200010011000001
122710551122221010201000001000001
133210768018181000000001000100001
50881389310691027271001000010000010100
50891389410796015151001000001001010100
50901389510635120200001000100001000001
509113896108590111001000000101000100
50921389910796023230100000001000000001
50931390010740113131100000010011000100
509413901108450661001200001000001001
50951390510628110100010000100001000001
509613906105231440010201000000000001
509713907105580990000201000001000100
509813911108171440000000001000100000
50991391210691027270000000010000110100
# Create new columns
Penn.dep1 = (Penn.dep .== 1) .|> Int
Penn.dep2 = (Penn.dep .== 2) .|> Int
Penn.log_inuidur1 = log.(Penn.inuidur1)
select!(Penn, Not(:dep))
5099×26 DataFrame
5074 rows omitted
RowColumn1abdtt4inuidur1inuidur2femaleblackhispanicothraceq1q2q3q4q5q6recallagelt35agegt54durablenondurablelusdhusdmulddep1dep2log_inuidur1
Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64Float64
111082401818000000001000000010012.89037
2410824011000000001000000100000.0
351074702727000000010000000100003.29584
41210607199000000100001000001002.19722
5131083102727000000001000110100103.29584
6141084502727100000001000100100003.29584
71510831099100000001000100100102.19722
8171085902727100000000100000100103.29584
9231051601515100001000000000100002.70805
10251066302811100000100000100100003.3322
11261074701212100000010011000001012.48491
12271055112222101001000001000001013.09104
13321076801818100000001000100001002.89037
5088138931069102727100100010000010100003.29584
5089138941079601515100100001001010100002.70805
5090138951063512020000100100001000001002.99573
50911389610859011100100000101000100000.0
5092138991079602323010000001000000001003.13549
5093139001074011313110000010011000100002.56495
50941390110845066100100001000001001011.79176
5095139051062811010001000100001000001002.30259
50961390610523144001001000000000001011.38629
50971390710558099000001000001000100012.19722
50981391110817144000000001000100000001.38629
5099139121069102727000000010000110100003.29584
# Function to get estimates
function get_estimates(data, index)
    A = select(data, [:t4, :female, :black, :othrace, :dep1, :dep2, :q2, :q3, :q4, :q5, :q6, :agelt35, :agegt54, :durable, :lusd, :husd])
    X = A[index, :]
    y = data.log_inuidur1[index]
    
    model = lm(@formula(log_inuidur1 ~ t4 + female + black + othrace + dep1 + dep2 + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd), data[index, :])
    intercept = coef(model)[1]
    coefficients = coef(model)[2:end]
    return (intercept, coefficients)
end

# Function to get random indices
function get_indices(data, num_samples)
    return rand(1:nrow(data), num_samples)
end

n = nrow(Penn)

# Bootstrap function
function boot(data, func, R)
    coeff_1 = []
    coeff_2 = []
    coeff_3 = []
    for i in 1:R
        indices = get_indices(data, n)
        _, coef = func(data, indices)
        push!(coeff_1, coef[1])
        push!(coeff_2, coef[2])
        push!(coeff_3, coef[3])
    end
    coeff_1_statistics = Dict("estimated_value" => mean(coeff_1), "std_error" => std(coeff_1))
    coeff_2_statistics = Dict("estimated_value" => mean(coeff_2), "std_error" => std(coeff_2))
    coeff_3_statistics = Dict("estimated_value" => mean(coeff_3), "std_error" => std(coeff_3))
    return Dict("coeff_1_statistics" => coeff_1_statistics, "coeff_2_statistics" => coeff_2_statistics, "coeff_3_statistics" => coeff_3_statistics), coeff_1, coeff_2, coeff_3
end
boot (generic function with 1 method)
results = boot(Penn, get_estimates, 1000)

println("Result for coefficient term t4 ", results[1]["coeff_1_statistics"])
println("Result for coefficient term female", results[1]["coeff_2_statistics"])
println("Result for coefficient term black", results[1]["coeff_3_statistics"])
results[1]
Dict{String, Dict{String, Float64}} with 3 entries:
  "coeff_1_statistics" => Dict("estimated_value"=>-0.0704815, "std_error"=>0.03…
  "coeff_2_statistics" => Dict("estimated_value"=>0.126044, "std_error"=>0.0350…
  "coeff_3_statistics" => Dict("estimated_value"=>-0.294649, "std_error"=>0.062…
data = DataFrame(
    Dict(
        "Variable" => ["t4", "female", "black"],
        "Estimate" => [
            results[1]["coeff_1_statistics"]["estimated_value"],
            results[1]["coeff_2_statistics"]["estimated_value"],
            results[1]["coeff_3_statistics"]["estimated_value"]
        ],
        "Standard Error" => [
            results[1]["coeff_1_statistics"]["std_error"],
            results[1]["coeff_2_statistics"]["std_error"],
            results[1]["coeff_3_statistics"]["std_error"]
        ]
    )
)
println(data)
3×3 DataFrame
 Row  Estimate    Standard Error  Variable 
     │ Float64     Float64         String   
─────┼──────────────────────────────────────
   1 │ -0.0704815       0.035352   t4
   2 │  0.126044        0.0350154  female
   3 │ -0.294649        0.0621332  black
# Plotting the histograms
histogram(results[2], bins=20, title="Histogram - t4's coefficient (Density)", label="t4", alpha=0.6, normalize=true)
histogram(results[3], bins=20, title="Histogram - female's coefficient (Density)", label="female", alpha=0.6, normalize=true)
histogram(results[4], bins=20, title="Histogram - black's coefficient (Density)", label="black", alpha=0.6, normalize=true)