Lab 5 - Julia Code#
Authors: Valerie Dube, Erzo Garay, Juan Marcos Guerrero, and Matias Villalba
# Libraries
using DataFrames
using CSV
using StatsPlots
using Statistics
using StatsModels
using GLM
# Import data and see first observations
df = CSV.read("../../data/processed_esti.csv", DataFrame)
first(df, 5)
| Row | y | w | gender_female | gender_male | gender_transgender | ethnicgrp_asian | ethnicgrp_black | ethnicgrp_mixed_multiple | ethnicgrp_other | ethnicgrp_white | partners1 | postlaunch | msm | age | imd_decile |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 27 | 5 |
| 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 19 | 6 |
| 3 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 26 | 4 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 20 | 2 |
| 5 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 24 | 3 |

(All columns are Int64.)
describe(df)
| Row | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| 1 | y | 0.351926 | 0 | 0.0 | 1 | 0 | Int64 |
| 2 | w | 0.529615 | 0 | 1.0 | 1 | 0 | Int64 |
| 3 | gender_female | 0.584244 | 0 | 1.0 | 1 | 0 | Int64 |
| 4 | gender_male | 0.413456 | 0 | 0.0 | 1 | 0 | Int64 |
| 5 | gender_transgender | 0.00230017 | 0 | 0.0 | 1 | 0 | Int64 |
| 6 | ethnicgrp_asian | 0.0638298 | 0 | 0.0 | 1 | 0 | Int64 |
| 7 | ethnicgrp_black | 0.0862565 | 0 | 0.0 | 1 | 0 | Int64 |
| 8 | ethnicgrp_mixed_multiple | 0.0885566 | 0 | 0.0 | 1 | 0 | Int64 |
| 9 | ethnicgrp_other | 0.013226 | 0 | 0.0 | 1 | 0 | Int64 |
| 10 | ethnicgrp_white | 0.748131 | 0 | 1.0 | 1 | 0 | Int64 |
| 11 | partners1 | 0.296722 | 0 | 0.0 | 1 | 0 | Int64 |
| 12 | postlaunch | 0.516964 | 0 | 1.0 | 1 | 0 | Int64 |
| 13 | msm | 0.130535 | 0 | 0.0 | 1 | 0 | Int64 |
| 14 | age | 23.1064 | 16 | 23.0 | 30 | 0 | Int64 |
| 15 | imd_decile | 3.47154 | 1 | 3.0 | 9 | 0 | Int64 |
control = select(filter(row -> row[:w] == 0, df), Not(:y))
treatment = select(filter(row -> row[:w] == 1, df), Not(:y))
Output — `treatment` (921×14 DataFrame, all columns Int64):

| Row | w | gender_female | gender_male | gender_transgender | ethnicgrp_asian | ethnicgrp_black | ethnicgrp_mixed_multiple | ethnicgrp_other | ethnicgrp_white | partners1 | postlaunch | msm | age | imd_decile |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 27 | 5 |
| 2 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 26 | 4 |
| 3 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 24 | 3 |
| 4 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 24 | 2 |
| 5 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 24 | 4 |
| 6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 27 | 2 |
| 7 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | 6 |
| 8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 18 | 4 |
| 9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 26 | 2 |
| 10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 23 | 6 |
| 11 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 24 | 4 |
| 12 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 26 | 3 |
| 13 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 26 | 3 |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
| 910 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 28 | 4 |
| 911 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 25 | 2 |
| 912 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 18 | 2 |
| 913 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 26 | 4 |
| 914 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 21 | 3 |
| 915 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 19 | 4 |
| 916 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 21 | 2 |
| 917 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 26 | 3 |
| 918 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 29 | 1 |
| 919 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 27 | 4 |
| 920 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 25 | 4 |
| 921 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 25 | 4 |
using DataFrames, Statistics
# Function to get descriptive statistics
function get_descriptive_stats(group::DataFrame, column::Symbol)
if column == :age
count_val = count(!ismissing, group[:, column])
else
count_val = sum(group[:, column] .== 1)
end
mean_val = mean(group[:, column])
std_val = std(group[:, column])
return count_val, mean_val, std_val
end
get_descriptive_stats (generic function with 1 method)
variables = setdiff(Symbol.(names(df)), [:w, :y])
control_stats = Dict(var => get_descriptive_stats(control, var) for var in variables)
treatment_stats = Dict(var => get_descriptive_stats(treatment, var) for var in variables)
# Convert the dictionary to a DataFrame
control_df = DataFrame(
Variable = collect(keys(control_stats)),
Count = [val[1] for val in values(control_stats)],
Mean = [val[2] for val in values(control_stats)],
Std = [val[3] for val in values(control_stats)]
)
treatment_df = DataFrame(
Variable = collect(keys(treatment_stats)),
Count = [val[1] for val in values(treatment_stats)],
Mean = [val[2] for val in values(treatment_stats)],
Std = [val[3] for val in values(treatment_stats)]
)
control_df = sort!(control_df, :Variable)
treatment_df = sort!(treatment_df, :Variable)
Output — `treatment_df` (13×4 DataFrame):

| Row | Variable | Count | Mean | Std |
|---|---|---|---|---|
| 1 | age | 921 | 23.1585 | 3.53874 |
| 2 | ethnicgrp_asian | 66 | 0.0716612 | 0.258066 |
| 3 | ethnicgrp_black | 74 | 0.0803474 | 0.271978 |
| 4 | ethnicgrp_mixed_multiple | 78 | 0.0846906 | 0.278572 |
| 5 | ethnicgrp_other | 9 | 0.00977199 | 0.0984226 |
| 6 | ethnicgrp_white | 694 | 0.753529 | 0.43119 |
| 7 | gender_female | 541 | 0.587405 | 0.492569 |
| 8 | gender_male | 377 | 0.409338 | 0.491979 |
| 9 | gender_transgender | 3 | 0.00325733 | 0.0570109 |
| 10 | imd_decile | 36 | 3.46037 | 1.46584 |
| 11 | msm | 114 | 0.123779 | 0.329508 |
| 12 | partners1 | 277 | 0.30076 | 0.458838 |
| 13 | postlaunch | 512 | 0.555917 | 0.497133 |
# Combine control_df and treatment_df into a single DataFrame
combined_df = DataFrame(
Variable = control_df.Variable,
Control_Count = control_df.Count,
Control_Mean = control_df.Mean,
Control_Std = control_df.Std,
Treatment_Count = treatment_df.Count,
Treatment_Mean = treatment_df.Mean,
Treatment_Std = treatment_df.Std
)
# Round numerical columns to 2 decimal places
function round_df(df::DataFrame, decimals::Int)
rounded_df = copy(df)
for col in names(df)[2:end]
rounded_df[!, col] = round.(df[!, col], digits=decimals)
end
return rounded_df
end
formatted_table = round_df(combined_df, 2)
# Print the formatted table
println("Table 1: Descriptive Statistics and Balance\n")
show(stdout, "text/plain", formatted_table)
Table 1: Descriptive Statistics and Balance
13×7 DataFrame
Row │ Variable Control_Count Control_Mean Control_Std Treatment_Count Treatment_Mean Treatment_Std
│ Symbol Float64 Float64 Float64 Float64 Float64 Float64
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
1 │ age 818.0 23.05 3.59 921.0 23.16 3.54
2 │ ethnicgrp_asian 45.0 0.06 0.23 66.0 0.07 0.26
3 │ ethnicgrp_black 76.0 0.09 0.29 74.0 0.08 0.27
4 │ ethnicgrp_mixed_multiple 76.0 0.09 0.29 78.0 0.08 0.28
5 │ ethnicgrp_other 14.0 0.02 0.13 9.0 0.01 0.1
6 │ ethnicgrp_white 607.0 0.74 0.44 694.0 0.75 0.43
7 │ gender_female 475.0 0.58 0.49 541.0 0.59 0.49
8 │ gender_male 342.0 0.42 0.49 377.0 0.41 0.49
9 │ gender_transgender 1.0 0.0 0.03 3.0 0.0 0.06
10 │ imd_decile 37.0 3.48 1.49 36.0 3.46 1.47
11 │ msm 113.0 0.14 0.35 114.0 0.12 0.33
12 │ partners1 239.0 0.29 0.46 277.0 0.3 0.46
13 │ postlaunch 387.0 0.47 0.5 512.0 0.56 0.5
The covariates are generally well balanced between the control and treatment groups. Most participants are white, the average age is approximately 23, and the mean IMD decile is around 3.5 in both arms, indicating that participants tend to come from more deprived areas.
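To check balance formally rather than by eye, each covariate's mean difference across arms can be tested. A minimal sketch, assuming the HypothesisTests package (not among this lab's original dependencies) is installed:

using HypothesisTests  # assumed installed; not loaded elsewhere in this lab
# Welch two-sample t-test of the age difference between arms (illustrative balance check)
t_age = UnequalVarianceTTest(float.(control.age), float.(treatment.age))
println(pvalue(t_age))  # a large p-value is consistent with successful randomization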
using DataFrames
using StatsPlots
# Group by 'w' and 'gender_male' and count occurrences
df_grouped = combine(groupby(df, [:w, :gender_male]), nrow => :count)
# Calculate the proportion of each group
df_grouped[!, :prop] .= df_grouped.count ./ sum(df_grouped.count)
df_grouped
| Row | w | gender_male | count | prop |
|---|---|---|---|---|
| 1 | 0 | 0 | 476 | 0.273721 |
| 2 | 0 | 1 | 342 | 0.196665 |
| 3 | 1 | 0 | 544 | 0.312823 |
| 4 | 1 | 1 | 377 | 0.216791 |
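These counts also support a formal independence check between assignment and gender. A sketch, again assuming HypothesisTests is available; the 2×2 matrix is filled from the counts above:

using HypothesisTests  # assumed installed
# Contingency table: rows are w = 0/1, columns are gender_male = 0/1 (counts from df_grouped)
counts = [476 342; 544 377]
println(pvalue(ChisqTest(counts)))  # tests independence of treatment assignment and gender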
using DataFrames
using Plots
# Extracting the data
w = df_grouped.w
gender_male = df_grouped.gender_male
prop = df_grouped.prop
# Creating a grouped bar plot
groupedbar(
string.(gender_male),
prop,
group = string.(w),
xlabel = "Gender Male (0 = Female, 1 = Male)",
ylabel = "Proportion",
title = "Grouped Bar Graph of Proportion by w and Gender",
legend = :topright,
bar_width = 0.5,
label = ["Treatment = 0" "Treatment = 1"]
)
# Display the plot
plot!(current())
using DataFrames
using StatsPlots
# Group by 'w' and 'partners1' and count occurrences
df_grouped = combine(groupby(df, [:w, :partners1]), nrow => :count)
# Calculate the proportion of each group
df_grouped[!, :prop] .= df_grouped.count ./ sum(df_grouped.count)
# Extracting the data
w = df_grouped.w
partners1 = df_grouped.partners1
prop = df_grouped.prop
# Creating a grouped bar plot
groupedbar(
string.(partners1),
prop,
group = string.(w),
xlabel = "Number of partners (one partner=1)",
ylabel = "Proportion",
title = "Proportion of Partners by Treatment Group",
legend = :topright,
bar_width = 0.5,
label = ["Treatment = 0" "Treatment = 1"]
)
# Display the plot
plot!(current())
using DataFrames
using StatsPlots
# Create density plots for each group in 'w'
density(
df.age,
group = df.w,
xlabel = "Age",
ylabel = "Percentage",
title = "Density Plot of Age by Treatment Group",
legend = :topright,
normalize = true,
label = ["Control" "Treatment"]
)
# Display the plot
plot!(current())
using DataFrames
using StatsPlots
# Group by 'imd_decile' and 'w' and count occurrences
df_grouped = combine(groupby(df, [:imd_decile, :w]), nrow => :count)
# Calculate the proportion of each group within 'imd_decile'
df_grouped[!, :prop] .= df_grouped.count ./ sum(df_grouped.count) * 100 # Convert to percentage
# Create a density-like bar plot
@df df_grouped groupedbar(
:imd_decile,
:prop,
group = :w,
xlabel = "IMD Decile",
ylabel = "Percentage",
title = "Density Plot of IMD Decile by Treatment Group",
legend = :topright,
bar_width = 0.8,
label = ["Control" "Treatment"]
)
# Display the plot
plot!(current())
using DataFrames
using StatsModels
using GLM
model_1 = @formula(y ~ w)
est_1 = lm(model_1, df)
print(est_1)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}

y ~ 1 + w

Coefficients:
────────────────────────────────────────────────────────────────────────
                Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
────────────────────────────────────────────────────────────────────────
(Intercept)  0.211491   0.0160531  13.17    <1e-37   0.180006   0.242977
w            0.265164   0.0220586  12.02    <1e-31   0.2219     0.308429
────────────────────────────────────────────────────────────────────────
model_2 = @formula(y ~ w + age + gender_female + ethnicgrp_white + ethnicgrp_black + ethnicgrp_mixed_multiple + partners1 + postlaunch + imd_decile)
est_2 = lm(model_2, df)
print(est_2)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}

y ~ 1 + w + age + gender_female + ethnicgrp_white + ethnicgrp_black + ethnicgrp_mixed_multiple + partners1 + postlaunch + imd_decile

Coefficients:
───────────────────────────────────────────────────────────────────────────────────────────
                               Coef.  Std. Error      t  Pr(>|t|)   Lower 95%    Upper 95%
───────────────────────────────────────────────────────────────────────────────────────────
(Intercept)               -0.163972   0.0880134   -1.86    0.0626  -0.336596    0.00865184
w 0.255827 0.0217719 11.75 <1e-29 0.213125 0.298529
age 0.0124373 0.00316801 3.93 <1e-04 0.0062238 0.0186509
gender_female 0.0928923 0.0223094 4.16 <1e-04 0.0491361 0.136649
ethnicgrp_white 0.0498876 0.0411584 1.21 0.2256 -0.0308379 0.130613
ethnicgrp_black -0.0397451 0.0541118 -0.73 0.4627 -0.145877 0.0663863
ethnicgrp_mixed_multiple -0.035726 0.0534103 -0.67 0.5037 -0.140481 0.0690296
partners1 -0.0590884 0.0242699 -2.43 0.0150 -0.10669 -0.0114868
postlaunch 0.0770255 0.0225539 3.42 0.0007 0.0327897 0.121261
imd_decile -0.00410827 0.0074161 -0.55 0.5797 -0.0186537 0.0104372
───────────────────────────────────────────────────────────────────────────────────────────
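Because the outcome is binary, this linear probability model is heteroskedastic by construction, so robust standard errors are a natural complement to the classical OLS tables above. A minimal sketch, assuming CovarianceMatrices.jl (added in section 3 below) accepts these fitted GLM models:

using CovarianceMatrices, LinearAlgebra  # assumed available; CovarianceMatrices is installed in section 3
# HC1 heteroskedasticity-robust standard errors for the covariate-adjusted model
robust_se = sqrt.(diag(vcov(HC1(), est_2)))
println(robust_se)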
3. Non-Linear Methods: DML#
using Pkg
Pkg.add("DataFrames")
Pkg.add("CSV")
Pkg.add("Statistics")
Pkg.add("StatsModels")
Pkg.add("GLM")
Pkg.add("MLJ")
Pkg.add("XGBoost")
Pkg.add("MLJXGBoostInterface")
Pkg.add("MLJGLMInterface")
Pkg.add("Lasso")
Pkg.add("GLMNet")
Pkg.add("CovarianceMatrices")
Resolving package versions...
No Changes to `C:\Users\Matias Villalba\.julia\environments\v1.10\Project.toml`
No Changes to `C:\Users\Matias Villalba\.julia\environments\v1.10\Manifest.toml`
Precompiling project...
using DataFrames
using CSV
using Statistics
using StatsModels
using GLM
using Random
using StatsBase
using LinearAlgebra
Precompiling CSV
  ✓ CSV
  1 dependency successfully precompiled in 9 seconds. 26 already precompiled.
Precompiling StatsModels, StatsFuns, GLM, and Distributions each emits the same warning and falls back to loading without precompilation:
  WARNING: Method definition (::Type{Base.MPFR.BigFloat})(Base.Irrational{:twoπ}) in module IrrationalConstants at irrationals.jl:223 overwritten in module StatsFuns on the same line (check for duplicate calls to `include`).
  ERROR: Method overwriting is not permitted during Module precompilation. Use `__precompile__(false)` to opt-out of precompilation.
  [ Info: Skipping precompilation since __precompile__(false). Importing the package anyway.
# Load the CSV file into a DataFrame (same relative path as in the first section)
file_path = "../../data/processed_esti.csv"
DML = CSV.read(file_path, DataFrame)
Random.seed!(1234)
# Split the data into training and testing sets
n = nrow(DML)
training = sample(1:n, round(Int, 0.75 * n), replace=false)
data_train = DML[training, :]
data_test = DML[setdiff(1:n, training), :]
# Extract the test outcome
Y_test = data_test.y
# Extract matrices for outcome, treatment, and controls
y = reshape(data_train[:, 1], :, 1) # outcome: got tested (y)
d = reshape(data_train[:, 2], :, 1) # treatment: assignment indicator (w)
x = Matrix(data_train[:, Not([1, 2])]) # controls: participant characteristics
# Display the first few rows to verify
println("First few rows of y:")
println(y[1:5, :])
println("First few rows of d:")
println(d[1:5, :])
println("First few rows of x:")
println(x[1:5, :])
First few rows of y:
[0; 0; 0; 0; 1;;]
First few rows of d:
[1; 1; 0; 1; 1;;]
First few rows of x:
[1 0 0 0 0 0 0 1 1 0 0 25 6; 0 1 0 0 0 0 0 1 0 0 1 28 5; 1 0 0 0 0 0 0 1 1 1 0 17 6; 1 0 0 0 0 0 0 1 0 1 0 25 2; 1 0 0 0 0 0 0 1 0 0 0 23 2]
using Pkg
Pkg.add("DecisionTree")
using DecisionTree
Resolving package versions...
Installed ScikitLearnBase ─ v0.5.0
Installed DecisionTree ──── v0.12.4
Updating `C:\Users\juanl\.julia\environments\v1.10\Project.toml`
[7806a523] + DecisionTree v0.12.4
Updating `C:\Users\juanl\.julia\environments\v1.10\Manifest.toml`
[7806a523] + DecisionTree v0.12.4
[6e75b9c4] + ScikitLearnBase v0.5.0
Precompiling project...
  ✓ ScikitLearnBase, DecisionTree, ColorVectorSpace → SpecialFunctionsExt, Latexify → DataFramesExt
  ✗ PyCall, BinaryProvider, MLJXGBoostInterface, Snappy, ExcelReaders, Parquet, ExcelFiles, ParquetFiles, Queryverse
  ? StatsFuns, StatsModels, Distributions, GLM, CovarianceMatrices, GLMNet, Lasso, MLJGLMInterface, MLJBase, MLJ, and related MLJ packages
4 dependencies successfully precompiled in 42 seconds. 284 already precompiled.
12 dependencies precompiled but different versions are currently loaded. Restart julia to access the new versions
15 dependencies failed but may be precompilable after restarting julia
15 dependencies had output during precompilation: each repeats the StatsFuns "Method overwriting is not permitted during Module precompilation" warning shown earlier.
9 dependencies errored.
For a report of the errors see `julia> err`. To retry use `pkg> precompile`
DML function for Regression Trees#
using DecisionTree
using Statistics, GLM, Random, DataFrames, CovarianceMatrices  # lm/@formula, shuffle, and HC0 robust SEs
function DML2_for_PLM_tree(data_train, dreg, yreg; nfold=10)
    nobs = nrow(data_train) # number of observations
    foldid = repeat(1:nfold, ceil(Int, nobs/nfold))[shuffle(1:nobs)] # define fold indices
    I = [findall(foldid .== i) for i in 1:nfold] # split observation indices into folds
    ytil = fill(NaN, nobs)
    dtil = fill(NaN, nobs)
    println("fold: ")
    for b in 1:nfold
        # Exclude the current fold for training (the treatment column is :w, not :d)
        datitanow = data_train[setdiff(1:nobs, I[b]), Not(:w)] # data without w, for the y-model
        datitanoy = data_train[setdiff(1:nobs, I[b]), Not(:y)] # data without y, for the d-model
        # Current fold for prediction
        datitanowpredict = data_train[I[b], Not(:w)]
        datitanoypredict = data_train[I[b], Not(:y)]
        # Fit models
        dfit = dreg(datitanoy)
        yfit = yreg(datitanow)
        # Make predictions (feature matrices must match those used in training)
        dhat = DecisionTree.predict(dfit, Matrix(datitanoypredict[:, Not(:w)]))
        yhat = DecisionTree.predict(yfit, Matrix(datitanowpredict[:, Not(:y)]))
        # Record residuals
        dtil[I[b]] .= data_train[I[b], :w] .- dhat
        ytil[I[b]] .= data_train[I[b], :y] .- yhat
        print("$b ")
    end
    # Regress one residual on the other (named columns so @formula can find them)
    rfit = lm(@formula(ytil ~ dtil), DataFrame(ytil = ytil, dtil = dtil))
    coef_est = coef(rfit)[2]
    se = sqrt(vcov(HC0(), rfit)[2, 2]) # heteroskedasticity-robust SE (CovarianceMatrices.jl)
    println("\ncoef (se) = $coef_est ($se)")
    return (coef_est = coef_est, se = se, dtil = dtil, ytil = ytil)
end
DML2_for_PLM_tree (generic function with 1 method)
DML function for Boosting Trees#
using MLJ
using MLJXGBoostInterface  # provides XGBoostRegressor for MLJ (XGBoost.jl alone does not)
function DML2_for_PLM_boosttree(data_train, dreg, yreg; nfold=10)
    nobs = nrow(data_train) # number of observations
    foldid = repeat(1:nfold, ceil(Int, nobs/nfold))[shuffle(1:nobs)] # define fold indices
    I = [findall(foldid .== i) for i in 1:nfold] # split observation indices into folds
    ytil = fill(NaN, nobs)
    dtil = fill(NaN, nobs)
    println("fold: ")
    for b in 1:nfold
        # Exclude the current fold for training (the treatment column is :w, not :d)
        datitanow = data_train[setdiff(1:nobs, I[b]), Not(:w)]
        datitanoy = data_train[setdiff(1:nobs, I[b]), Not(:y)]
        # Current fold for prediction
        datitanowpredict = data_train[I[b], Not(:w)]
        datitanoypredict = data_train[I[b], Not(:y)]
        # Fit models (dreg/yreg return unfitted MLJ machines)
        best_boostt = fit!(dreg(datitanoy), verbosity=0)
        best_boosty = fit!(yreg(datitanow), verbosity=0)
        # Make predictions on the held-out fold's features
        dhat = MLJ.predict(best_boostt, datitanoypredict[:, Not(:w)])
        yhat = MLJ.predict(best_boosty, datitanowpredict[:, Not(:y)])
        # Record residuals
        dtil[I[b]] .= data_train[I[b], :w] .- dhat
        ytil[I[b]] .= data_train[I[b], :y] .- yhat
        print("$b ")
    end
    # Regress one residual on the other (named columns so @formula can find them)
    rfit = lm(@formula(ytil ~ dtil), DataFrame(ytil = ytil, dtil = dtil))
    coef_est = coef(rfit)[2]
    se = sqrt(vcov(HC0(), rfit)[2, 2]) # heteroskedasticity-robust SE
    println("\ncoef (se) = $coef_est ($se)")
    return (coef_est = coef_est, se = se, dtil = dtil, ytil = ytil)
end
DML2_for_PLM_boosttree (generic function with 1 method)
DML function for Lasso#
using DataFrames
using CSV
using GLM
using Random
using StatsBase
using GLMNet
using Statistics
using CovarianceMatrices # HC0 robust covariance for the final-stage regression
function DML2_for_PLM(x, d, y, dreg, yreg; nfold=10)
nobs = size(x, 1) # number of observations
foldid = repeat(1:nfold, ceil(Int, nobs/nfold))[shuffle(1:nobs)] # define fold indices
I = [findall(foldid .== i) for i in 1:nfold] # split observation indices into folds
ytil = fill(NaN, nobs)
dtil = fill(NaN, nobs)
println("fold: ")
for b in 1:nfold
# Exclude the current fold for training
train_indices = setdiff(1:nobs, I[b])
test_indices = I[b]
# Fit models
dfit = dreg(x[train_indices, :], d[train_indices])
yfit = yreg(x[train_indices, :], y[train_indices])
        # Make predictions (GLMNet.predict returns an n×1 matrix when lambda is a vector)
        dhat = GLMNet.predict(dfit, float.(x[test_indices, :]))
        yhat = GLMNet.predict(yfit, float.(x[test_indices, :]))
        # Record residuals
        dtil[test_indices] .= d[test_indices] .- vec(dhat)
        ytil[test_indices] .= y[test_indices] .- vec(yhat)
print("$b ")
end
# Ensure ytil and dtil are column vectors
ytil = vec(ytil)
dtil = vec(dtil)
# Regress one residual on the other
data = DataFrame(ytil = ytil, dtil = dtil)
rfit = lm(@formula(ytil ~ dtil), data)
coef_est = coef(rfit)[2]
    se = sqrt(vcov(HC0(), rfit)[2, 2]) # heteroskedasticity-robust SE
println("\ncoef (se) = $coef_est ($se)")
return Dict("coef.est" => coef_est, "se" => se, "dtil" => dtil, "ytil" => ytil)
end
DML2_for_PLM (generic function with 1 method)
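For reference, the final-stage regression in all of these functions implements the standard partialling-out (DML2) estimator. With cross-fitted residuals $\tilde{y}_i = y_i - \hat{\ell}(x_i)$ and $\tilde{d}_i = d_i - \hat{m}(x_i)$, the slope recovered by `lm(@formula(ytil ~ dtil), data)` is, up to the demeaning implied by the intercept (the residuals are approximately mean-zero),

$$\hat{\theta} = \frac{\sum_{i=1}^{n} \tilde{d}_i \, \tilde{y}_i}{\sum_{i=1}^{n} \tilde{d}_i^{\,2}}.$$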
3.1. Lasso#
# Define Lasso regression functions (fixed penalty; glmnet expects Float64 inputs)
dreg_lasso = (x, d) -> GLMNet.glmnet(float.(x), float.(vec(d)), alpha=1.0, lambda=[0.1])
yreg_lasso = (x, y) -> GLMNet.glmnet(float.(x), float.(vec(y)), alpha=1.0, lambda=[0.1])
# Apply DML with Lasso
DML2_lasso = DML2_for_PLM(x, d, y, dreg_lasso, yreg_lasso, nfold=10)
# Extract results
coef_lasso = DML2_lasso["coef.est"]
se_lasso = DML2_lasso["se"]
prRes_lassoD = mean((DML2_lasso["dtil"]).^2)
prRes_lassoY = mean((DML2_lasso["ytil"]).^2)
# Format results (column names with spaces need Pair syntax, not backticks)
prRes_lasso = DataFrame(
    "Estimate" => [coef_lasso],
    "Standard Error" => [se_lasso],
    "RMSE D" => [sqrt(prRes_lassoD)],
    "RMSE Y" => [sqrt(prRes_lassoY)]
)
# Display the results
println(prRes_lasso)
fold:
1 2 3 4 5 6 7 8 9 10
The message treatment providing information about Internet-accessed sexually transmitted infection testing predicts an increase of 25.13 percentage points in the probability that a person gets tested, compared to receiving information about nearby clinics offering in-person testing. By providing both groups with information about testing, we mitigate the potential reminder effect, as both groups are equally prompted to consider testing. This approach allows us to isolate the impact of the type of information (Internet-accessed testing versus in-person clinic testing) on the likelihood of getting tested. Through randomized assignment, we establish causality rather than mere correlation, confirming that the intervention's effect is driven by the unique advantages of Internet-accessed testing, such as increased privacy, reduced embarrassment, and convenience.
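The penalty λ = 0.1 used above is fixed ad hoc. GLMNet.jl can instead select it by cross-validation; a sketch, assuming the same x and d matrices defined earlier:

using GLMNet
# Cross-validated Lasso for the treatment equation (illustrative sketch)
dreg_lasso_cv = (x, d) -> begin
    cv = glmnetcv(float.(x), float.(vec(d)))   # 10-fold CV over the lambda path
    best = cv.lambda[argmin(cv.meanloss)]      # penalty minimizing mean CV loss
    GLMNet.glmnet(float.(x), float.(vec(d)), lambda=[best])  # refit at the selected penalty
end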
3.2. Regression Trees#
# Set up the control variables for the regression tree
X_basic = "gender_transgender + ethnicgrp_asian + ethnicgrp_black + ethnicgrp_mixed_multiple + ethnicgrp_other + ethnicgrp_white + partners1 + postlaunch + msm + age + imd_decile"
# @formula does not support $-interpolation directly; wrap the macro call in @eval
y_form_tree = @eval @formula(y ~ $(Meta.parse(X_basic)))
t_form_tree = @eval @formula(w ~ $(Meta.parse(X_basic)))
# Define tree regression functions (DecisionTree.jl uses fit!(model, X, y), not Python-style .fit)
yreg_tree = dataa -> DecisionTree.fit!(DecisionTreeRegressor(min_samples_leaf=5, max_depth=5), Matrix(dataa[:, Not(:y)]), float.(Vector(dataa[:, :y])))
treg_tree = dataa -> DecisionTree.fit!(DecisionTreeRegressor(min_samples_leaf=5, max_depth=5), Matrix(dataa[:, Not(:w)]), float.(Vector(dataa[:, :w])))
# Apply DML with regression tree
DML2_tree = DML2_for_PLM_tree(data_train, treg_tree, yreg_tree, nfold=10)
# Extract results (the function returns a named tuple)
coef_tree = DML2_tree.coef_est
se_tree = DML2_tree.se
prRes_treeD = mean((DML2_tree.dtil).^2)
prRes_treeY = mean((DML2_tree.ytil).^2)
# Format results
prRes_tree = DataFrame(
    "Estimate" => [coef_tree],
    "Standard Error" => [se_tree],
    "RMSE D" => [sqrt(prRes_treeD)],
    "RMSE Y" => [sqrt(prRes_treeY)]
)
# Display the results
println(prRes_tree)
With regression trees, the message treatment predicts an increase of 23.08 percentage points in the probability of getting tested; the interpretation is the same as for the Lasso estimate above.
3.3. Boosting Trees#
# Set up the control variables for the boosted tree
X_basic = "gender_transgender + ethnicgrp_asian + ethnicgrp_black + ethnicgrp_mixed_multiple + ethnicgrp_other + ethnicgrp_white + partners1 + postlaunch + msm + age + imd_decile"
y_form_tree = @eval @formula(y ~ $(Meta.parse(X_basic)))
t_form_tree = @eval @formula(w ~ $(Meta.parse(X_basic)))
# Define boosted tree regression functions (targets cast to Float64 so MLJ treats them as Continuous)
yreg_treeboost = dataa -> machine(XGBoostRegressor(max_depth=2, nrounds=1000, eta=0.01, subsample=0.5), dataa[:, Not(:y)], float.(dataa[:, :y]))
treg_treeboost = dataa -> machine(XGBoostRegressor(max_depth=2, nrounds=1000, eta=0.01, subsample=0.5), dataa[:, Not(:w)], float.(dataa[:, :w]))
# Apply DML with boosted trees
DML2_boosttree = DML2_for_PLM_boosttree(data_train, treg_treeboost, yreg_treeboost, nfold=10)
# Extract results (the function returns a named tuple)
coef_boosttree = DML2_boosttree.coef_est
se_boosttree = DML2_boosttree.se
prRes_boosttreeD = mean((DML2_boosttree.dtil).^2)
prRes_boosttreeY = mean((DML2_boosttree.ytil).^2)
# Format results
prRes_boosttree = DataFrame(
    "Estimate" => [coef_boosttree],
    "Standard Error" => [se_boosttree],
    "RMSE D" => [sqrt(prRes_boosttreeD)],
    "RMSE Y" => [sqrt(prRes_boosttreeY)]
)
# Display the results
println(prRes_boosttree)
With boosted trees, the message treatment predicts an increase of 25.28 percentage points in the probability of getting tested; the interpretation again mirrors the Lasso result above.
3.4. Random Forest#
# Function for Double Machine Learning with Partially Linear Model using Random Forest
# (x, d, y are passed explicitly; the original signature omitted them, causing a MethodError)
function DML2_for_PLM_RF(x, d, y, dreg, yreg; nfold=10)
    nobs = size(x, 1) # number of observations
    foldid = repeat(1:nfold, ceil(Int, nobs/nfold))[shuffle(1:nobs)] # define fold indices
    I = [findall(foldid .== i) for i in 1:nfold] # split observation indices into folds
    ytil = fill(NaN, nobs)
    dtil = fill(NaN, nobs)
    println("fold: ")
    for b in 1:nfold
        # Exclude the current fold for training
        train_indices = setdiff(1:nobs, I[b])
        test_indices = I[b]
        # Fit models
        dfit = dreg(x[train_indices, :], d[train_indices])
        yfit = yreg(x[train_indices, :], y[train_indices])
        # Make predictions
        dhat = DecisionTree.predict(dfit, x[test_indices, :])
        yhat = DecisionTree.predict(yfit, x[test_indices, :])
        # Record residuals
        dtil[test_indices] .= d[test_indices] .- dhat
        ytil[test_indices] .= y[test_indices] .- yhat
        print("$b ")
    end
    # Regress one residual on the other
    data = DataFrame(ytil = ytil, dtil = dtil)
    rfit = lm(@formula(ytil ~ dtil), data)
    coef_est = coef(rfit)[2]
    se = sqrt(vcov(HC0(), rfit)[2, 2]) # heteroskedasticity-robust SE
    println("\ncoef (se) = $coef_est ($se)")
    return Dict("coef.est" => coef_est, "se" => se, "dtil" => dtil, "ytil" => ytil)
end
DML2_for_PLM_RF (generic function with 1 method)
# Define Random Forest regression functions (DecisionTree.jl API: fit!(model, X, y), not .fit)
dreg_RF = (x, d) -> DecisionTree.fit!(RandomForestRegressor(n_trees=100), x, float.(vec(d)))
yreg_RF = (x, y) -> DecisionTree.fit!(RandomForestRegressor(n_trees=100), x, float.(vec(y)))
# Apply DML with Random Forest
DML2_RF = DML2_for_PLM_RF(x, d, y, dreg_RF, yreg_RF, nfold=10)
# Extract results
coef_RF = DML2_RF["coef.est"]
se_RF = DML2_RF["se"]
prRes_RFD = mean((DML2_RF["dtil"]).^2)
prRes_RFY = mean((DML2_RF["ytil"]).^2)
# Format results
prRes_RF = DataFrame(
    "Estimate" => [coef_RF],
    "Standard Error" => [se_RF],
    "RMSE D" => [sqrt(prRes_RFD)],
    "RMSE Y" => [sqrt(prRes_RFY)]
)
# Display the results
println(prRes_RF)
With random forests, the message treatment predicts an increase of 24.14 percentage points in the probability of getting tested; the interpretation again mirrors the Lasso result above.
3.5. Table and Coefficient plot#
Table#
Coefficient Plot#
# Collect estimates, SEs, and RMSEs across methods (requires the four DML runs above)
methods = ["Lasso", "Reg Tree", "Boost Tree", "Random Forest"]
estimates = [DML2_lasso["coef.est"], DML2_tree.coef_est, DML2_boosttree.coef_est, DML2_RF["coef.est"]]
ses = [DML2_lasso["se"], DML2_tree.se, DML2_boosttree.se, DML2_RF["se"]]
rmse_D = sqrt.([mean(DML2_lasso["dtil"].^2), mean(DML2_tree.dtil.^2),
                mean(DML2_boosttree.dtil.^2), mean(DML2_RF["dtil"].^2)])
rmse_Y = sqrt.([mean(DML2_lasso["ytil"].^2), mean(DML2_tree.ytil.^2),
                mean(DML2_boosttree.ytil.^2), mean(DML2_RF["ytil"].^2)])
# Julia matrices have no rownames!/colnames! (those are R idioms); build a labeled DataFrame instead
table = DataFrame("Method" => methods, "Estimate" => estimates,
                  "Standard Error" => ses, "RMSE Y" => rmse_Y, "RMSE D" => rmse_D)
# Display the table
println(table)
using Pkg
Pkg.add("Plots")
using Plots  # package names are case-sensitive: `using plots` fails to load
Resolving package versions...
No Changes to `C:\Users\juanl\.julia\environments\v1.10\Project.toml`
No Changes to `C:\Users\juanl\.julia\environments\v1.10\Manifest.toml`
# `table` is already a DataFrame; just add confidence bounds around each estimate
table_ci = copy(table)
table_ci.CI_Lower_1 = table_ci.Estimate .- 2.576 .* table_ci."Standard Error"
table_ci.CI_Upper_1 = table_ci.Estimate .+ 2.576 .* table_ci."Standard Error"
table_ci.CI_Lower_5 = table_ci.Estimate .- 1.96 .* table_ci."Standard Error"
table_ci.CI_Upper_5 = table_ci.Estimate .+ 1.96 .* table_ci."Standard Error"
table_ci.CI_Lower_10 = table_ci.Estimate .- 1.645 .* table_ci."Standard Error"
table_ci.CI_Upper_10 = table_ci.Estimate .+ 1.645 .* table_ci."Standard Error"
# Coefficient plot: point estimates with error bars at each confidence level
scatter(table_ci.Method, table_ci.Estimate,
        yerror = 2.576 .* table_ci."Standard Error", label = "99% CI", color = :green)
scatter!(table_ci.Method, table_ci.Estimate,
         yerror = 1.96 .* table_ci."Standard Error", label = "95% CI", color = :blue)
scatter!(table_ci.Method, table_ci.Estimate,
         yerror = 1.645 .* table_ci."Standard Error", label = "90% CI", color = :red)
title!("Estimated Coefficients with Confidence Intervals")
xlabel!("Method")
ylabel!("Estimate")
plot!(xrotation = 45)  # xrotation! is not a function; pass it as a plot attribute
3.6. Model#
To choose the best model, we compare the RMSEs of the outcome variable Y across methods. The lowest RMSE for Y is achieved by Lasso (0.4716420), while the lowest RMSE for the treatment D is achieved by Boosting Trees (0.4983734). DML could therefore be run with Y residualized using Lasso and the treatment residualized using Boosting Trees.
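Acting on that comparison, the residuals from the preferred learner for each nuisance can be combined in a single final partialling-out regression. A sketch reusing objects created above (assuming the Lasso and boosted-tree runs both completed):

# Hypothetical combination: Y residualized by Lasso, D residualized by boosted trees
mixed = DataFrame(ytil = vec(DML2_lasso["ytil"]), dtil = vec(DML2_boosttree.dtil))
rfit_mixed = lm(@formula(ytil ~ dtil), mixed)
println(coef(rfit_mixed)[2], " (", sqrt(vcov(HC0(), rfit_mixed)[2, 2]), ")")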