From 71e760c6aa55814574dbd0cc8234f04951356d1a Mon Sep 17 00:00:00 2001
From: Anshul Singhvi <anshulsinghvi@gmail.com>
Date: Tue, 21 Feb 2023 00:51:18 +0530
Subject: [PATCH 01/63] Update by.jl

---
 src/by.jl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/by.jl b/src/by.jl
index 3f955b15..f28de3b5 100644
--- a/src/by.jl
+++ b/src/by.jl
@@ -1,13 +1,12 @@
-function bydomain(x::Symbol, domain::Symbol, design::SurveyDesign, func::Function)
+function bydomain(x::Symbol, domain, design::SurveyDesign, func::Function)
     gdf = groupby(design.data, domain)
-    nd = length(unique(design.data[!, domain]))
     X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
     return X
 end
 
-function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function)
+function bydomain(x::Symbol, domain, design::ReplicateDesign, func::Function)
     gdf = groupby(design.data, domain)
-    nd = length(unique(design.data[!, domain]))
+    nd = length(gdf)
     X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
     Xt_mat = Array{Float64,2}(undef, (nd, design.replicates))
     for i = 1:design.replicates

From 77f967f694417f51aae23d9e2824be5f30bef7a8 Mon Sep 17 00:00:00 2001
From: Anshul Singhvi <anshulsinghvi@gmail.com>
Date: Tue, 21 Feb 2023 00:51:57 +0530
Subject: [PATCH 02/63] Update mean.jl

---
 src/mean.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mean.jl b/src/mean.jl
index 4db9fed1..75fef801 100644
--- a/src/mean.jl
+++ b/src/mean.jl
@@ -130,7 +130,7 @@ julia> mean(:api00, :cname, bclus1)
   11 │ Mendocino    623.25    1.09545e-13
 ```
 """
-function mean(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
+function mean(x::Symbol, domain, design::AbstractSurveyDesign)
     weighted_mean(x, w) = mean(x, StatsBase.weights(w))
     df = bydomain(x, domain, design, weighted_mean)
     rename!(df, :statistic => :mean)

From 5a01fe9b22f14092aa9f00c6f4e74f58d0fd46eb Mon Sep 17 00:00:00 2001
From: Anshul Singhvi <anshulsinghvi@gmail.com>
Date: Tue, 21 Feb 2023 00:53:42 +0530
Subject: [PATCH 03/63] Update total.jl

---
 src/total.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/total.jl b/src/total.jl
index cc9bb06d..a451324c 100644
--- a/src/total.jl
+++ b/src/total.jl
@@ -120,7 +120,7 @@ julia> total(:api00, :cname, bclus1)
   11 │ Mendocino     84380.6        80215.9
 ```
 """
-function total(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
+function total(x::Symbol, domain, design::AbstractSurveyDesign)
     df = bydomain(x, domain, design, wsum)
     rename!(df, :statistic => :total)
 end

From 9139e6f7fd09ab46b1237abe1409564750efb8de Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Wed, 22 Feb 2023 14:53:30 +0530
Subject: [PATCH 04/63] Adding a default constructor and a constructor which
 takes `replicate_weights` for `ReplicateDesign`.

---
 src/SurveyDesign.jl | 50 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index e0211352..4027c73f 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -157,4 +157,54 @@ struct ReplicateDesign <: AbstractSurveyDesign
     allprobs::Symbol # Right now only singlestage approx supported
     pps::Bool
     replicates::UInt
+    replicate_weights::Vector{Symbol}
+
+    # default constructor
+    function ReplicateDesign(
+        data::DataFrame,
+        cluster::Symbol,
+        popsize::Symbol,
+        sampsize::Symbol,
+        strata::Symbol,
+        weights::Symbol,
+        allprobs::Symbol,
+        pps::Bool,
+        replicates::UInt,
+        replicate_weights::Vector{Symbol}
+    )
+        new(data, cluster, popsize, sampsize, strata, weights, allprobs,
+           pps, replicates, replicate_weights)
+    end
+
+    # constructor with given replicate_weights
+    function ReplicateDesign(
+        data::AbstractDataFrame;
+        clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+        strata::Union{Nothing,Symbol} = nothing,
+        popsize::Union{Nothing,Symbol} = nothing,
+        weights::Union{Nothing,Symbol} = nothing,
+        replicates::UInt,
+        replicate_weights::Vector{Symbol}
+    )
+        # call the SurveyDesign constructor
+        base_design = SurveyDesign(
+                        data;
+                        clusters=clusters,
+                        strata=strata,
+                        popsize=popsize,
+                        weights=weights
+                      )
+        new(
+            base_design.data,
+            base_design.cluster,
+            base_design.popsize,
+            base_design.sampsize,
+            base_design.strata,
+            base_design.weights,
+            base_design.allprobs,
+            base_design.pps,
+            replicates,
+            replicate_weights
+        )
+    end
 end

From a4f68afad710cdb531580b04163c77c7115a5ba1 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Wed, 22 Feb 2023 14:54:32 +0530
Subject: [PATCH 05/63] Minor change in `bootweights`.

---
 src/bootstrap.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/bootstrap.jl b/src/bootstrap.jl
index 7e01247f..2fa02df6 100644
--- a/src/bootstrap.jl
+++ b/src/bootstrap.jl
@@ -54,6 +54,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
         design.weights,
         design.allprobs,
         design.pps,
-        replicates,
+        UInt(replicates),
+        [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates]
     )
 end

From 5ea25251e4f351c544c75800a1c6d1345f7ca68d Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Thu, 23 Feb 2023 00:38:51 +0530
Subject: [PATCH 06/63] Modifying docstring for `ReplicateDesign`.

---
 src/SurveyDesign.jl | 49 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index 4027c73f..54e377e6 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -127,14 +127,32 @@ end
 """
     ReplicateDesign <: AbstractSurveyDesign
 
-Survey design obtained by replicating an original design using [`bootweights`](@ref).
+Survey design obtained by replicating an original design using [`bootweights`](@ref). If
+replicate weights are available, then they can be used to directly create a `ReplicateDesign`.
+
+# Constructors
+
+```julia
+ReplicateDesign(
+    data::AbstractDataFrame;
+    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing,
+    popsize::Union{Nothing,Symbol} = nothing,
+    weights::Union{Nothing,Symbol} = nothing,
+    replicates::UInt,
+    replicate_weights::Vector{Symbol}
+)
+```
+
+# Examples
+
+Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`.
 
 ```jldoctest
 julia> apistrat = load_data("apistrat");
 
 julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
 
-julia> bootstrat = bootweights(dstrat; replicates=1000)
+julia> bootstrat = bootweights(dstrat; replicates=1000)     # creating a ReplicateDesign using bootweights
 ReplicateDesign:
 data: 200×1044 DataFrame
 strata: stype
@@ -145,7 +163,34 @@ sampsize: [100, 100, 100  …  50]
 weights: [44.21, 44.21, 44.21  …  15.1]
 allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
 replicates: 1000
+
 ```
+
+If the replicate weights are given to us already, then we can directly pass them to the `ReplicateDesign` constructor. For instance, in
+the above example, suppose we save the `bootstrat` data as a CSV file.
+
+```jldoctest
+julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data);
+```
+
+We can now pass the replicate weights directly to the `ReplicateDesign` constructor.
+
+```jldoctest
+julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame);
+
+julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv; strata=:stype, weights=:pw, replicates=UInt(1000), replicate_weights=[Symbol("replicate_"*string(replicate)) for replicate in 1:1000])
+ReplicateDesign:
+data: 200×1044 DataFrame
+strata: stype
+    [E, E, E  …  H]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  15.1]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+replicates: 1000
+```
+
 """
 struct ReplicateDesign <: AbstractSurveyDesign
     data::AbstractDataFrame

From a6573b1b70fb337764388a5c434560767128e24a Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 24 Feb 2023 14:57:01 +0530
Subject: [PATCH 07/63] Making `replicate_weights` a positional argument in
 `ReplicateDesign` constructor, and removing the redundant `replicates`
 argument.

---
 src/SurveyDesign.jl | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index 54e377e6..e0dc6b3b 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -134,12 +134,11 @@ replicate weights are available, then they can be used to directly create a `Rep
 
 ```julia
 ReplicateDesign(
-    data::AbstractDataFrame;
+    data::AbstractDataFrame,
+    replicate_weights::Vector{Symbol};
     clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing,
     popsize::Union{Nothing,Symbol} = nothing,
-    weights::Union{Nothing,Symbol} = nothing,
-    replicates::UInt,
-    replicate_weights::Vector{Symbol}
+    weights::Union{Nothing,Symbol} = nothing
 )
 ```
 
@@ -178,7 +177,7 @@ We can now pass the replicate weights directly to the `ReplicateDesign` construc
 ```jldoctest
 julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame);
 
-julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv; strata=:stype, weights=:pw, replicates=UInt(1000), replicate_weights=[Symbol("replicate_"*string(replicate)) for replicate in 1:1000])
+julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv, [Symbol("replicate_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw)
 ReplicateDesign:
 data: 200×1044 DataFrame
 strata: stype
@@ -223,13 +222,12 @@ struct ReplicateDesign <: AbstractSurveyDesign
 
     # constructor with given replicate_weights
     function ReplicateDesign(
-        data::AbstractDataFrame;
+        data::AbstractDataFrame,
+        replicate_weights::Vector{Symbol};
         clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
         strata::Union{Nothing,Symbol} = nothing,
         popsize::Union{Nothing,Symbol} = nothing,
-        weights::Union{Nothing,Symbol} = nothing,
-        replicates::UInt,
-        replicate_weights::Vector{Symbol}
+        weights::Union{Nothing,Symbol} = nothing
     )
         # call the SurveyDesign constructor
         base_design = SurveyDesign(
@@ -248,7 +246,7 @@ struct ReplicateDesign <: AbstractSurveyDesign
             base_design.weights,
             base_design.allprobs,
             base_design.pps,
-            replicates,
+            length(replicate_weights),
             replicate_weights
         )
     end

From 3893699fea511ca6fc819f9648ee9af861ec3c26 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 24 Feb 2023 18:14:26 +0530
Subject: [PATCH 08/63] Minor changes to the `ReplicateDesign` docstring.

---
 src/SurveyDesign.jl | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index e0dc6b3b..ebbd4369 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -136,17 +136,24 @@ replicate weights are available, then they can be used to directly create a `Rep
 ReplicateDesign(
     data::AbstractDataFrame,
     replicate_weights::Vector{Symbol};
-    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing,
+    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+    strata::Union{Nothing,Symbol} = nothing,
     popsize::Union{Nothing,Symbol} = nothing,
     weights::Union{Nothing,Symbol} = nothing
 )
 ```
 
+# Arguments
+
+The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which
+must be a `Vector` of `Symbols`. The `Symbol`s should represent the columns of `data` which contain the replicate weights. The
+names of these columns must be of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weights.
+
 # Examples
 
 Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`.
 
-```jldoctest
+```jldoctest replicate-design
 julia> apistrat = load_data("apistrat");
 
 julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
@@ -166,15 +173,16 @@ replicates: 1000
 ```
 
 If the replicate weights are given to us already, then we can directly pass them to the `ReplicateDesign` constructor. For instance, in
-the above example, suppose we save the `bootstrat` data as a CSV file.
+the above example, suppose we had the `bootstrat` data as a CSV file.
 
-```jldoctest
+```jldoctest replicate-design
+julia> using CSV;
 julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data);
 ```
 
 We can now pass the replicate weights directly to the `ReplicateDesign` constructor.
 
-```jldoctest
+```jldoctest replicate-design
 julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame);
 
 julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv, [Symbol("replicate_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw)

From 25e12a8184878a4b22abfbdffea5faf2caf618fe Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 24 Feb 2023 18:59:54 +0530
Subject: [PATCH 09/63] Adding missing doc dependencies, and fixing error in
 docstring for `ReplicateDesign`.

---
 docs/Project.toml   | 4 +++-
 src/SurveyDesign.jl | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index 76d580a7..f11e8201 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,5 +1,7 @@
 [deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index ebbd4369..b4232ce7 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -153,7 +153,7 @@ names of these columns must be of the form `replicate_i`, where `i` ranges from
 
 Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`.
 
-```jldoctest replicate-design
+```jldoctest replicate-design; setup = :(using Survey, CSV, DataFrames)
 julia> apistrat = load_data("apistrat");
 
 julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
@@ -177,7 +177,9 @@ the above example, suppose we had the `bootstrat` data as a CSV file.
 
 ```jldoctest replicate-design
 julia> using CSV;
+
 julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data);
+
 ```
 
 We can now pass the replicate weights directly to the `ReplicateDesign` constructor.
@@ -196,6 +198,7 @@ sampsize: [100, 100, 100  …  50]
 weights: [44.21, 44.21, 44.21  …  15.1]
 allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
 replicates: 1000
+
 ```
 
 """

From be60c2b5d76a1e8e35baee573eaf7ac672fbb95f Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 24 Feb 2023 22:09:49 +0530
Subject: [PATCH 10/63] Adding tests for `ReplicateDesign` constructor.

---
 test/SurveyDesign.jl | 14 ++++++++++++++
 test/runtests.jl     |  5 +++++
 2 files changed, 19 insertions(+)

diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl
index 0d42fc19..2c18f48a 100644
--- a/test/SurveyDesign.jl
+++ b/test/SurveyDesign.jl
@@ -259,3 +259,17 @@ end
     yrbs = copy(yrbs_original)
     dyrbs = SurveyDesign(yrbs; clusters = :psu, strata = :stratum, weights = :weight)
 end
+
+@testset "ReplicateDesign_constructor" begin
+    for (sample, sample_direct) in [(bsrs, bsrs_direct), (bstrat, bstrat_direct), (dclus1_boot, dclus1_boot_direct)]
+        @test isequal(sample.data, sample_direct.data)
+        @test isequal(sample.popsize, sample_direct.popsize)
+        @test isequal(sample.sampsize, sample_direct.sampsize)
+        @test isequal(sample.strata, sample_direct.strata)
+        @test isequal(sample.weights, sample_direct.weights)
+        @test isequal(sample.allprobs, sample_direct.allprobs)
+        @test isequal(sample.pps, sample_direct.pps)
+        @test isequal(sample.replicates, sample_direct.replicates)
+        @test isequal(sample.replicate_weights, sample_direct.replicate_weights)
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index f2fc89d4..0485dbaa 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,21 +4,26 @@ using CategoricalArrays
 
 const STAT_TOL = 1e-5
 const SE_TOL = 1e-1
+const REPLICATE_WEIGHTS = [Symbol("replicate_"*string(i)) for i in 1:4000]
 
 # Simple random sample
 apisrs = load_data("apisrs") # Load API dataset
 srs = SurveyDesign(apisrs, weights = :pw)
 bsrs = srs |> bootweights # Create replicate design
+bsrs_direct = ReplicateDesign(bsrs.data, REPLICATE_WEIGHTS, weights = :pw)  # using ReplicateDesign constructor 
+
 # Stratified sample
 apistrat = load_data("apistrat") # Load API dataset
 dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign
 bstrat = dstrat |> bootweights # Create replicate design
+bstrat_direct = ReplicateDesign(bstrat.data, REPLICATE_WEIGHTS, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
 
 # One-stage cluster sample
 apiclus1 = load_data("apiclus1") # Load API dataset
 apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake for pw column
 dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus1_boot = dclus1 |> bootweights # Create replicate design
+dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATE_WEIGHTS, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
 
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)

From 2da49c557813c31f62cd0e82298310f4b6657e39 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 24 Feb 2023 22:54:54 +0530
Subject: [PATCH 11/63] Adding a `type` field to `ReplicateDesign`.

---
 src/SurveyDesign.jl | 5 ++++-
 src/bootstrap.jl    | 1 +
 src/show.jl         | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index b4232ce7..2feb68e9 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -211,6 +211,7 @@ struct ReplicateDesign <: AbstractSurveyDesign
     weights::Symbol # Effective weights in case of singlestage approx supported
     allprobs::Symbol # Right now only singlestage approx supported
     pps::Bool
+    type::String
     replicates::UInt
     replicate_weights::Vector{Symbol}
 
@@ -224,11 +225,12 @@ struct ReplicateDesign <: AbstractSurveyDesign
         weights::Symbol,
         allprobs::Symbol,
         pps::Bool,
+        type::String,
         replicates::UInt,
         replicate_weights::Vector{Symbol}
     )
         new(data, cluster, popsize, sampsize, strata, weights, allprobs,
-           pps, replicates, replicate_weights)
+           pps, type, replicates, replicate_weights)
     end
 
     # constructor with given replicate_weights
@@ -257,6 +259,7 @@ struct ReplicateDesign <: AbstractSurveyDesign
             base_design.weights,
             base_design.allprobs,
             base_design.pps,
+            "bootstrap",
             length(replicate_weights),
             replicate_weights
         )
diff --git a/src/bootstrap.jl b/src/bootstrap.jl
index 2fa02df6..cc1941e8 100644
--- a/src/bootstrap.jl
+++ b/src/bootstrap.jl
@@ -54,6 +54,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
         design.weights,
         design.allprobs,
         design.pps,
+        "bootstrap",
         UInt(replicates),
         [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates]
     )
diff --git a/src/show.jl b/src/show.jl
index 2c323704..09d22b48 100644
--- a/src/show.jl
+++ b/src/show.jl
@@ -37,6 +37,7 @@ Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = surveyshow(io, des
 function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign)
     # new_io = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50))
     surveyshow(io, design)
+    printinfo(io, "\ntype", design.type; newline = false)
     printinfo(io, "\nreplicates", design.replicates; newline = false)
 end
 

From 409228ae0b1229495b1dc2861e4c045483b5b5f9 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 24 Feb 2023 23:08:02 +0530
Subject: [PATCH 12/63] Updating tests for `show.jl`.

---
 test/show.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/show.jl b/test/show.jl
index d21cb76f..3035190f 100644
--- a/test/show.jl
+++ b/test/show.jl
@@ -23,6 +23,7 @@
     sampsize: [200, 200, 200  …  200]
     weights: [30.97, 30.97, 30.97  …  30.97]
     allprobs: [0.0323, 0.0323, 0.0323  …  0.0323]
+    type: bootstrap
     replicates: 4000"""
 
     show(io, MIME("text/plain"), bsrs)
@@ -58,6 +59,7 @@ end
     sampsize: [100, 100, 100  …  50]
     weights: [44.21, 44.21, 44.21  …  15.1]
     allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+    type: bootstrap
     replicates: 4000"""
 
     show(io, MIME("text/plain"), bstrat)
@@ -93,6 +95,7 @@ end
     sampsize: [15, 15, 15  …  15]
     weights: [50.4667, 50.4667, 50.4667  …  50.4667]
     allprobs: [0.0198, 0.0198, 0.0198  …  0.0198]
+    type: bootstrap
     replicates: 4000"""
 
     show(io, MIME("text/plain"), dclus1_boot)

From d13c288cc32c169cb966919226db5cb464385b71 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Fri, 24 Feb 2023 23:54:01 +0530
Subject: [PATCH 13/63] [WIP] Updating contribution guidelines

---
 CONTRIBUTING.md | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ca099c87..9108e095 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -93,9 +93,7 @@ This way you are modifying as little as possible of previously written code, and
 * Codedev coverage has reached 100%. So please make sure to add testing cases of contributed code to keep it at 100%.
 * If you want to propose a new functionality it is strongly recommended to open an issue first and reach a decision on the final design.
   Then a pull request serves an implementation of the agreed way how things should work.
-* If you are a new contributor and would like to get a guidance on what area
-  you could focus your first PR please do not hesitate to ask and JuliaData members
-  will help you with picking a topic matching your experience.
+* If you are a new contributor and would like to get a guidance on what area you could focus your first PR please do not hesitate to ask and JuliaData members will help you with picking a topic matching your experience.
 * Feel free to open, or comment on, an issue and solicit feedback early on,
   especially if you're unsure about aligning with design goals and direction,
   or if relevant historical comments are ambiguous.
@@ -104,22 +102,14 @@ This way you are modifying as little as possible of previously written code, and
 * Aim for atomic commits, if possible, e.g. `change 'foo' behavior like so` &
   `'bar' handles such and such corner case`,
   rather than `update 'foo' and 'bar'` & `fix typo` & `fix 'bar' better`.
-* Pull requests are tested against release and development branches of Julia,
-  so using `Pkg.test("DataFrames")` as you develop can be helpful.
+* Pull requests are tested against release branches of Julia, so using `Pkg.test("Survey")` as you develop can be helpful.
 * The style guidelines outlined below are not the personal style of most contributors,
   but for consistency throughout the project, we've adopted them.
-* It is recommended to disable GitHub Actions on your fork; check Settings > Actions.
 * If a PR adds a new exported name then make sure to add a docstring for it and
   add a reference to it in the documentation.
 * A PR with breaking changes should have `[BREAKING]` as a first part of its name.
-* If a PR changes or adds functionality please update NEWS.md file accordingly as
-  a part of the PR (along with the link to the PR); please do not add entries
-  to NEWS.md for changes that are bug fixes or are not user visible, such as
-  adding tests, updating documentation or improving code layout.
-* If you make a PR please try to avoid pushing many small commits to GitHub in
-  a sequence as each such commit triggers a separate CI job, which takes over
-  an hour. This has a consequence of making other PRs in packages from the JuliaData
-  ecosystem wait for such CI jobs to finish as hey share a common pool of CI resources.
+* A PR which is still draft or work in progress should have `WIP:` as a first part of its name.
+* If you make a PR please try to avoid pushing many small commits to GitHub in a sequence as each such commit triggers a separate CI job, which takes compuational time, and not a good use of the small pool of CI resources.
 
 ## Style Guidelines
 

From 3c0f68a2f7b9eb6225468c2b7599a704cc94bcdd Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Sat, 25 Feb 2023 01:43:16 +0530
Subject: [PATCH 14/63] Fixing minor errors in documentation.

---
 README.md           | 5 +++--
 src/SurveyDesign.jl | 2 ++
 src/bootstrap.jl    | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 16aaac65..c2c673c5 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,8 @@ cluster: none
 popsize: [6190.0, 6190.0, 6190.0  …  6190.0]
 sampsize: [200, 200, 200  …  200]
 weights: [31.0, 31.0, 31.0  …  31.0]
-probs: [0.0323, 0.0323, 0.0323  …  0.0323]
+allprobs: [0.0323, 0.0323, 0.0323  …  0.0323]
+type: bootstrap
 replicates: 1000
 
 julia> mean(:api00, bootsrs)
@@ -171,4 +172,4 @@ We gratefully acknowledge the JuliaLab at MIT for financial support for this pro
 
 ## References
 
-[^1]: [Lumley, Thomas. Complex surveys: a guide to analysis using R. John Wiley & Sons, 2011.](https://books.google.co.in/books?hl=en&lr=&id=L96ludyhFBsC&oi=fnd&pg=PP12&dq=complex+surveys+lumley&ots=ie0y1lnzv1&sig=c4UHI3arjspMJ6OYzlX32E9rNRI#v=onepage&q=complex%20surveys%20lumley&f=false) Page 44
\ No newline at end of file
+[^1]: [Lumley, Thomas. Complex surveys: a guide to analysis using R. John Wiley & Sons, 2011.](https://books.google.co.in/books?hl=en&lr=&id=L96ludyhFBsC&oi=fnd&pg=PP12&dq=complex+surveys+lumley&ots=ie0y1lnzv1&sig=c4UHI3arjspMJ6OYzlX32E9rNRI#v=onepage&q=complex%20surveys%20lumley&f=false) Page 44
diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index 2feb68e9..43acee38 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -168,6 +168,7 @@ popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
 sampsize: [100, 100, 100  …  50]
 weights: [44.21, 44.21, 44.21  …  15.1]
 allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
 replicates: 1000
 
 ```
@@ -197,6 +198,7 @@ popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
 sampsize: [100, 100, 100  …  50]
 weights: [44.21, 44.21, 44.21  …  15.1]
 allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
 replicates: 1000
 
 ```
diff --git a/src/bootstrap.jl b/src/bootstrap.jl
index cc1941e8..b789783c 100644
--- a/src/bootstrap.jl
+++ b/src/bootstrap.jl
@@ -18,6 +18,7 @@ popsize: [757, 757, 757  …  757]
 sampsize: [15, 15, 15  …  15]
 weights: [50.4667, 50.4667, 50.4667  …  50.4667]
 allprobs: [0.0198, 0.0198, 0.0198  …  0.0198]
+type: bootstrap
 replicates: 1000
 ```
 """

From 5efee45a62f34ba27ddc6219d58d6b81de74be8e Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Sat, 25 Feb 2023 16:48:12 +0530
Subject: [PATCH 15/63] add jk1 and jkn stratified

---
 src/Survey.jl     |  2 ++
 src/jackknife.jl  | 82 +++++++++++++++++++++++++++++++++++++++++++++++
 test/jackknife.jl |  4 +++
 test/runtests.jl  |  1 +
 4 files changed, 89 insertions(+)
 create mode 100644 src/jackknife.jl
 create mode 100644 test/jackknife.jl

diff --git a/src/Survey.jl b/src/Survey.jl
index 62f263a2..884453c6 100644
--- a/src/Survey.jl
+++ b/src/Survey.jl
@@ -25,6 +25,7 @@ include("boxplot.jl")
 include("show.jl")
 include("ratio.jl")
 include("by.jl")
+include("jackknife.jl")
 
 export load_data
 export AbstractSurveyDesign, SurveyDesign, ReplicateDesign
@@ -35,5 +36,6 @@ export hist, sturges, freedman_diaconis
 export boxplot
 export bootweights
 export ratio
+export jk1weights, jackknifeweights
 
 end
diff --git a/src/jackknife.jl b/src/jackknife.jl
new file mode 100644
index 00000000..43e54f2e
--- /dev/null
+++ b/src/jackknife.jl
@@ -0,0 +1,82 @@
+"""
+    Delete 1 jackknife with no/single stratum
+    
+    ## Reference
+    pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+"""
+function jk1weights(design::SurveyDesign)
+    cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting?
+    psus = unique(cluster_sorted[!, design.cluster])
+    nh = length(psus)
+    npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
+    replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
+    cluster_weights = cluster_sorted[!, design.weights]
+    for (i,replicate) in enumerate(replicates)
+        cluster_sorted[!, "replicate_"*string(i)] = vcat(
+            [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
+            ]...,
+        ) .* cluster_weights
+    end
+    return cluster_sorted
+end
+
+"""
+    WIP: Delete-1 jackknife algorithm for replicate weights columns
+
+    ## Reference
+    pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+"""
+function jackknifeweights(design::SurveyDesign)
+    stratified = groupby(design.data, design.strata)
+    H = length(keys(stratified))
+    @show H
+    substrata_dfs = DataFrame[]
+    for h in 1:H
+        @show h
+        substrata = DataFrame(stratified[h])
+        cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting?
+        psus = unique(cluster_sorted[!, design.cluster])
+        # cluster_weights = substrata[!, design.weights]
+        # gdf = groupby(substrata, design.cluster)
+        # psus = unique(substrata[!, design.cluster])
+        nh = length(psus)
+        npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
+        replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
+        @show nh, psus, npsus
+        # @show replicates
+        cluster_weights = cluster_sorted[!, design.weights]
+        @show size(cluster_weights)
+        for (i,replicate) in enumerate(replicates)
+            tmp = vcat(
+                    [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
+                    ]...,
+                ) .* cluster_weights
+            @show tmp
+            cluster_sorted[!, "replicate_"*string(i)] = tmp
+            
+        end
+        push!(substrata_dfs, cluster_sorted)
+        return vcat(substrata_dfs...)
+    end
+end
+        # push!(cluster_weights, [(count(==(i), replicate)) for i in 1:nh].*(nh/(nh-1)) )# main jackknife algo. 
+        # for j in 1:nh 
+        #     clusterweight = clusterweights[j]
+        #     for i in 1:nh 
+        #         gdf[i][!,"replicate_"*string(j)] = clusterweight[i].*DataFrame(gdf[i]).weights
+        #     end
+        # end
+        # stratified[h].replicate_1 = DataFrame(gdf).replicate_1
+    
+    # df = vcat(substrata_dfs...)
+    # return ReplicateDesign(
+    #     df,
+    #     design.cluster,
+    #     design.popsize,
+    #     design.sampsize,
+    #     design.strata,
+    #     design.weights,
+    #     design.allprobs,
+    #     design.pps,
+    #     replicates,
+    # )
diff --git a/test/jackknife.jl b/test/jackknife.jl
new file mode 100644
index 00000000..7578998f
--- /dev/null
+++ b/test/jackknife.jl
@@ -0,0 +1,4 @@
+@testset "jackknife" begin
+    dclus1_jack = dclus1 |> jackknifeweights
+    dclus1_jk1 = dclus1 |> jk1weights # Create replicate design
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index f2fc89d4..d7e8572e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -35,3 +35,4 @@ include("hist.jl")
 include("boxplot.jl")
 include("ratio.jl")
 include("show.jl")
+# include("jackknife.jl")
\ No newline at end of file

From 7ef490e0f77b273900c25d9c1e81f34277e8910f Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Sun, 26 Feb 2023 12:29:19 +0530
Subject: [PATCH 16/63] return ReplicateDesign class

---
 src/jackknife.jl | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 43e54f2e..a48daac6 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -1,6 +1,9 @@
 """
-    Delete 1 jackknife with no/single stratum
+    Delete 1 jackknife with  for unstratified designs.
     
+    Replicate weights are given by:
+    wi(hj)  = 0, if observation unit i is in psu j of stratum h
+            = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
     ## Reference
     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
 """
@@ -17,11 +20,26 @@ function jk1weights(design::SurveyDesign)
             ]...,
         ) .* cluster_weights
     end
-    return cluster_sorted
+    return ReplicateDesign(
+        cluster_sorted,
+        design.cluster,
+        design.popsize,
+        design.sampsize,
+        design.strata,
+        design.weights,
+        design.allprobs,
+        design.pps,
+        length(replicates),
+    )
 end
 
 """
     WIP: Delete-1 jackknife algorithm for replicate weights columns
+    
+    Replicate weights are given by:
+    wi(hj)  = wi, if observation unit i is not in stratum h
+            = 0, if observation unit i is in psu j of stratum h
+            = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
 
     ## Reference
     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)

From 6668e042f55df635fa4e14cd15d55f1e156b0d3b Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Sun, 26 Feb 2023 17:04:29 +0530
Subject: [PATCH 17/63] simple jkn algo, add estimator

---
 src/jackknife.jl | 115 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 85 insertions(+), 30 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index a48daac6..17bc97b3 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -33,6 +33,22 @@ function jk1weights(design::SurveyDesign)
     )
 end
 
+"""
+    Jackknife estimator
+
+    Uses jackknife replicate weights to calculate variance of estimator θ̂
+"""
+function JackknifeEstimator(θ̂::Float64,design::ReplicateDesign)
+    gdf = groupby(design.data,design.strata)
+    # @show keys(gdf)
+    calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf)
+    calc_df[!,:multiplier] = (calc_df[!,:x1] .- 1 ) ./ calc_df[!,:x1]
+    # for each_gdf in gdf
+    #     @show nrow(each_gdf)
+    # end
+    return calc_df
+end
+
 """
     WIP: Delete-1 jackknife algorithm for replicate weights columns
     
@@ -45,38 +61,77 @@ end
     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
 """
 function jackknifeweights(design::SurveyDesign)
-    stratified = groupby(design.data, design.strata)
-    H = length(keys(stratified))
-    @show H
-    substrata_dfs = DataFrame[]
-    for h in 1:H
-        @show h
-        substrata = DataFrame(stratified[h])
-        cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting?
-        psus = unique(cluster_sorted[!, design.cluster])
-        # cluster_weights = substrata[!, design.weights]
-        # gdf = groupby(substrata, design.cluster)
-        # psus = unique(substrata[!, design.cluster])
-        nh = length(psus)
-        npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
-        replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
-        @show nh, psus, npsus
-        # @show replicates
-        cluster_weights = cluster_sorted[!, design.weights]
-        @show size(cluster_weights)
-        for (i,replicate) in enumerate(replicates)
-            tmp = vcat(
-                    [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
-                    ]...,
-                ) .* cluster_weights
-            @show tmp
-            cluster_sorted[!, "replicate_"*string(i)] = tmp
-            
-        end
-        push!(substrata_dfs, cluster_sorted)
-        return vcat(substrata_dfs...)
+    df = design.data
+    ## Find number of psus (nh) in each strata, used inside loop
+    stratified = groupby(df, design.strata)
+    nh_df = combine(x -> length(unique(x[!,design.cluster])), gdf)
+    
+    # Iterate over each combination of strata X cluster
+    unique_strata_cols_df = unique(select(df, design.strata, design.cluster))
+    counter = 1
+    for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2])
+        colname = "replicate_"*string(counter)
+        @show colname, strata, psu
+        ###### three mutually exclusive cases based on strata and psu
+
+        ### if observation unit i is not in stratum h
+        not_in_strata = df[df[!,design.strata] .!= strata,:] |> nrow
+        # Set replicate weights at these indices to: wi  
+        
+        ### if observation unit i is in psu j of stratum h
+        in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:]
+        # Set replicate weights at these indices to: 0 
+        
+        ### if observation unit i is in stratum h but not in psu j.
+        in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:]
+        # Set replicate weights at these indices to: (nh-1)/nh
+        counter += 1
     end
+    return
 end
+
+    # @show unique_strata_cols_df
+    # stratas = unique(df[!,design.strata])
+    # psus = unique(df[!,design.cluster])
+        # for i in 1:I
+            
+        #     if design.data[i,design.weights]
+        #     design.data[i,colname] =  
+        # end
+# sort!(design.data, (design.strata,design.cluster))
+    # stratified = groupby(design.data, design.strata)
+    # H = length(keys(stratified))
+    # @show H
+    # calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf).x1
+    # substrata_dfs = DataFrame[]
+    # for h in 1:H
+    #     @show h
+    #     substrata = DataFrame(stratified[h])
+    #     cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting?
+    #     psus = unique(cluster_sorted[!, design.cluster])
+    #     # cluster_weights = substrata[!, design.weights]
+    #     # gdf = groupby(substrata, design.cluster)
+    #     # psus = unique(substrata[!, design.cluster])
+    #     nh = length(psus)
+    #     npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
+    #     replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
+    #     @show nh, psus, npsus
+    #     # @show replicates
+    #     cluster_weights = cluster_sorted[!, design.weights]
+    #     @show size(cluster_weights)
+    #     for (i,replicate) in enumerate(replicates)
+    #         tmp = vcat(
+    #                 [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
+    #                 ]...,
+    #             ) .* cluster_weights
+    #         @show tmp
+    #         cluster_sorted[!, "replicate_"*string(i)] = tmp
+            
+    #     end
+    #     push!(substrata_dfs, cluster_sorted)
+    #     return vcat(substrata_dfs...)
+    # end
+
         # push!(cluster_weights, [(count(==(i), replicate)) for i in 1:nh].*(nh/(nh-1)) )# main jackknife algo. 
         # for j in 1:nh 
         #     clusterweight = clusterweights[j]

From cdf943021ce06e587269e7b180e5611a532ac5d4 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Sun, 26 Feb 2023 17:04:53 +0530
Subject: [PATCH 18/63] Add nhanes to runtests

---
 src/Survey.jl     | 2 +-
 test/jackknife.jl | 6 ++++--
 test/runtests.jl  | 5 +++++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/Survey.jl b/src/Survey.jl
index 884453c6..91a0eb81 100644
--- a/src/Survey.jl
+++ b/src/Survey.jl
@@ -36,6 +36,6 @@ export hist, sturges, freedman_diaconis
 export boxplot
 export bootweights
 export ratio
-export jk1weights, jackknifeweights
+export jk1weights, JackknifeEstimator, jackknifeweights
 
 end
diff --git a/test/jackknife.jl b/test/jackknife.jl
index 7578998f..b33bd25b 100644
--- a/test/jackknife.jl
+++ b/test/jackknife.jl
@@ -1,4 +1,6 @@
 @testset "jackknife" begin
-    dclus1_jack = dclus1 |> jackknifeweights
+    dclus1_jack = dnhanes |> jackknifeweights
     dclus1_jk1 = dclus1 |> jk1weights # Create replicate design
-end
\ No newline at end of file
+end
+dclus1_jk1 = dclus1 |> jk1weights
+dclus1_jack = dnhanes |> jackknifeweights
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index d7e8572e..28bd8cae 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -20,6 +20,11 @@ apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake fo
 dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus1_boot = dclus1 |> bootweights # Create replicate design
 
+# NHANES
+nhanes = load_data("nhanes")
+dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR)
+dnhanes_boot = dnhanes |> bootweights
+
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)
     @test size(load_data("apiclus2")) == (126, 41)

From 92becd4244a624460d0e2a51e0f5ee1b24d244d9 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Sun, 26 Feb 2023 17:34:47 +0530
Subject: [PATCH 19/63] nh-1 multiplier

---
 src/jackknife.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 17bc97b3..3a5411a2 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -65,6 +65,7 @@ function jackknifeweights(design::SurveyDesign)
     ## Find number of psus (nh) in each strata, used inside loop
     stratified = groupby(df, design.strata)
     nh_df = combine(x -> length(unique(x[!,design.cluster])), gdf)
+    nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) 
     
     # Iterate over each combination of strata X cluster
     unique_strata_cols_df = unique(select(df, design.strata, design.cluster))
@@ -75,7 +76,7 @@ function jackknifeweights(design::SurveyDesign)
         ###### three mutually exclusive cases based on strata and psu
 
         ### if observation unit i is not in stratum h
-        not_in_strata = df[df[!,design.strata] .!= strata,:] |> nrow
+        not_in_strata = df[df[!,design.strata] .!= strata,:]
         # Set replicate weights at these indices to: wi  
         
         ### if observation unit i is in psu j of stratum h
@@ -84,7 +85,8 @@ function jackknifeweights(design::SurveyDesign)
         
         ### if observation unit i is in stratum h but not in psu j.
         in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:]
-        # Set replicate weights at these indices to: (nh-1)/nh
+        # Set replicate weights at these indices to: nh/(nh-1)
+        
         counter += 1
     end
     return

From d36581b6ef7234f3e074cece01dff740ab25d1fb Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Sun, 26 Feb 2023 17:36:06 +0530
Subject: [PATCH 20/63] fix comment

---
 src/jackknife.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 3a5411a2..11f340a0 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -77,15 +77,15 @@ function jackknifeweights(design::SurveyDesign)
 
         ### if observation unit i is not in stratum h
         not_in_strata = df[df[!,design.strata] .!= strata,:]
-        # Set replicate weights at these indices to: wi  
+        # Set replicate weight at these indices to: wi  
         
         ### if observation unit i is in psu j of stratum h
         in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:]
-        # Set replicate weights at these indices to: 0 
+        # Set replicate weight at these indices to: 0 
         
         ### if observation unit i is in stratum h but not in psu j.
         in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:]
-        # Set replicate weights at these indices to: nh/(nh-1)
+        # Set replicate weight at these indices to: nh/(nh-1) * wi
         
         counter += 1
     end

From 7e61dd17f25ee2c13901dda48b65101959b553dd Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Mon, 27 Feb 2023 19:58:21 +0530
Subject: [PATCH 21/63] add wi to jknife

---
 src/jackknife.jl | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 11f340a0..9b97c076 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -63,23 +63,29 @@ end
 function jackknifeweights(design::SurveyDesign)
     df = design.data
     ## Find number of psus (nh) in each strata, used inside loop
-    stratified = groupby(df, design.strata)
-    nh_df = combine(x -> length(unique(x[!,design.cluster])), gdf)
+    stratified_gdf = groupby(df, design.strata)
+    nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf)
     nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) 
     
     # Iterate over each combination of strata X cluster
     unique_strata_cols_df = unique(select(df, design.strata, design.cluster))
     counter = 1
+    wᵢ = df[:,design.weights]
     for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2])
         colname = "replicate_"*string(counter)
         @show colname, strata, psu
+
+        # Initialise replicate_i with original sampling weights
+        df[!,colname] = wᵢ
+
         ###### three mutually exclusive cases based on strata and psu
 
-        ### if observation unit i is not in stratum h
-        not_in_strata = df[df[!,design.strata] .!= strata,:]
-        # Set replicate weight at these indices to: wi  
-        
+        ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata)
+        # not_in_strata = df[df[!,design.strata] .!= strata,:]
+        # Keep replicate weight at these indices to: wi  
+
         ### if observation unit i is in psu j of stratum h
+        # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu)
         in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:]
         # Set replicate weight at these indices to: 0 
         

From 0a656e4d04e5824e39d5dff9b936f40dce9122b3 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Tue, 28 Feb 2023 01:29:58 +0530
Subject: [PATCH 22/63] Changing the names of `replicate_weights` to the form
 `replicate_i`.

---
 src/SurveyDesign.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index 43acee38..e616eaf3 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -244,6 +244,9 @@ struct ReplicateDesign <: AbstractSurveyDesign
         popsize::Union{Nothing,Symbol} = nothing,
         weights::Union{Nothing,Symbol} = nothing
     )
+        # rename the replicate weights if needed
+        rename!(data, [replicate_weights[index] => "replicate_"*string(index) for index in 1:length(replicate_weights)])
+
         # call the SurveyDesign constructor
         base_design = SurveyDesign(
                         data;

From 7fa80ee7876a6bf34fa338d903a238dd7e9b36c2 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Tue, 28 Feb 2023 01:50:31 +0530
Subject: [PATCH 23/63] Adding constructor for `ReplicateDesign` which takes a
 `UnitRange`.

---
 src/SurveyDesign.jl | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index e616eaf3..f574bcc4 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -269,4 +269,22 @@ struct ReplicateDesign <: AbstractSurveyDesign
             replicate_weights
         )
     end
+
+    # replicate weights given as a range of columns
+    ReplicateDesign(
+        data::AbstractDataFrame,
+        replicate_weights::UnitRange{Int};
+        clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+        strata::Union{Nothing,Symbol} = nothing,
+        popsize::Union{Nothing,Symbol} = nothing,
+        weights::Union{Nothing,Symbol} = nothing
+    ) =
+        ReplicateDesign(
+            data,
+            Symbol.(names(data)[replicate_weights]);
+            clusters=clusters,
+            strata=strata,
+            popsize=popsize,
+            weights=weights
+        )
 end

From 648f1a5813704ce046423424d761c4266997d9ae Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Tue, 28 Feb 2023 02:02:17 +0530
Subject: [PATCH 24/63] Adding constructor for `ReplicateDesign` which takes a
 regular expression.

---
 src/SurveyDesign.jl | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index f574bcc4..c499d869 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -287,4 +287,22 @@ struct ReplicateDesign <: AbstractSurveyDesign
             popsize=popsize,
             weights=weights
         )
+
+    # replicate weights given as regular expression
+    ReplicateDesign(
+        data::AbstractDataFrame,
+        replicate_weights::Regex;
+        clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+        strata::Union{Nothing,Symbol} = nothing,
+        popsize::Union{Nothing,Symbol} = nothing,
+        weights::Union{Nothing,Symbol} = nothing
+    ) =
+        ReplicateDesign(
+            data,
+            Symbol.(names(data)[findall(name -> occursin(replicate_weights, name), names(data))]);
+            clusters=clusters,
+            strata=strata,
+            popsize=popsize,
+            weights=weights
+        )
 end

From a88c51bdfd2710bd1c4e2ca02eea2bfda9592575 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Tue, 28 Feb 2023 16:52:18 +0530
Subject: [PATCH 25/63] Adding docs for new constructors of `ReplicateDesign`.

---
 src/SurveyDesign.jl | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index c499d869..21904c0a 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -141,13 +141,36 @@ ReplicateDesign(
     popsize::Union{Nothing,Symbol} = nothing,
     weights::Union{Nothing,Symbol} = nothing
 )
+
+ReplicateDesign(
+    data::AbstractDataFrame,
+    replicate_weights::UnitIndex{Int};
+    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+    strata::Union{Nothing,Symbol} = nothing,
+    popsize::Union{Nothing,Symbol} = nothing,
+    weights::Union{Nothing,Symbol} = nothing
+)
+
+ReplicateDesign(
+    data::AbstractDataFrame,
+    replicate_weights::Regex;
+    clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
+    strata::Union{Nothing,Symbol} = nothing,
+    popsize::Union{Nothing,Symbol} = nothing,
+    weights::Union{Nothing,Symbol} = nothing
+)
 ```
 
 # Arguments
 
-The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which
-must be a `Vector` of `Symbols`. The `Symbol`s should represent the columns of `data` which contain the replicate weights. The
-names of these columns must be of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weights.
+The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which can
+be of one of the following types.
+
+- `Vector{Symbol}`: In this case, each `Symbol` in the vector should represent a column of `data` containing the replicate weights.
+- `UnitIndex{Int}`: For instance, this could be UnitRange(5:10). This will mean that the replicate weights are contained in columns 5 through 10.
+- `Regex`: In this case, all the columns of `data` which match this `Regex` will be treated as the columns containing the replicate weights.
+
+All the columns containing the replicate weights will be renamed to the form `replicate_i`, where `i` ranges from 1 to the number of columns containing the replicate weights.
 
 # Examples
 
@@ -174,21 +197,21 @@ replicates: 1000
 ```
 
 If the replicate weights are given to us already, then we can directly pass them to the `ReplicateDesign` constructor. For instance, in
-the above example, suppose we had the `bootstrat` data as a CSV file.
+the above example, suppose we had the `bootstrat` data as a CSV file (for this example, we also rename the columns containing the replicate weights to the form `r_i`).
 
 ```jldoctest replicate-design
 julia> using CSV;
 
+julia> DataFrames.rename!(bootstrat.data, ["replicate_"*string(index) => "r_"*string(index) for index in 1:1000])
+
 julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data);
 
 ```
 
-We can now pass the replicate weights directly to the `ReplicateDesign` constructor.
+We can now pass the replicate weights directly to the `ReplicateDesign` constructor, either as a `Vector{Symbol}`, a `UnitRange` or a `Regex`.
 
 ```jldoctest replicate-design
-julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame);
-
-julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv, [Symbol("replicate_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw)
+julia> bootstrat_direct = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), [Symbol("r_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw)
 ReplicateDesign:
 data: 200×1044 DataFrame
 strata: stype
@@ -201,6 +224,10 @@ allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
 type: bootstrap
 replicates: 1000
 
+julia> bootstrat_unitrange = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw)
+
+julia> bootstrat_regex = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw)
+
 ```
 
 """

From 9ce6a7379c1f9b508d3dbcfb7ec6736f84a5a584 Mon Sep 17 00:00:00 2001
From: Debartha Paul <pauldeep102@gmail.com>
Date: Tue, 28 Feb 2023 22:51:25 +0530
Subject: [PATCH 26/63] Added apiclus2 for runtests

---
 test/runtests.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/runtests.jl b/test/runtests.jl
index f2fc89d4..084a0317 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -20,6 +20,12 @@ apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake fo
 dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus1_boot = dclus1 |> bootweights # Create replicate design
 
+# Two-stage cluster sample
+apiclus2 = load_data("apiclus2") # Load API dataset
+apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake for pw column
+dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
+dclus2_boot = dclus2 |> bootweights # Create replicate design
+
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)
     @test size(load_data("apiclus2")) == (126, 41)

From a7c6f80b5e32ffe4cb34c2754c84521f00d21dae Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Wed, 1 Mar 2023 00:22:31 +0530
Subject: [PATCH 27/63] Adding tests for the new `ReplicateDesign`
 constructors.

---
 test/SurveyDesign.jl | 30 +++++++++++++++++++++++++++++-
 test/runtests.jl     | 19 +++++++++++++++----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl
index 2c18f48a..5084c1e4 100644
--- a/test/SurveyDesign.jl
+++ b/test/SurveyDesign.jl
@@ -260,7 +260,7 @@ end
     dyrbs = SurveyDesign(yrbs; clusters = :psu, strata = :stratum, weights = :weight)
 end
 
-@testset "ReplicateDesign_constructor" begin
+@testset "ReplicateDesign_direct" begin
     for (sample, sample_direct) in [(bsrs, bsrs_direct), (bstrat, bstrat_direct), (dclus1_boot, dclus1_boot_direct)]
         @test isequal(sample.data, sample_direct.data)
         @test isequal(sample.popsize, sample_direct.popsize)
@@ -273,3 +273,31 @@ end
         @test isequal(sample.replicate_weights, sample_direct.replicate_weights)
     end
 end
+
+@testset "ReplicateDesign_unitrange" begin
+    for (sample, sample_unitrange) in [(bsrs, bsrs_unitrange), (bstrat, bstrat_unitrange), (dclus1_boot, dclus1_boot_unitrange)]
+        @test isequal(sample.data, sample_unitrange.data)
+        @test isequal(sample.popsize, sample_unitrange.popsize)
+        @test isequal(sample.sampsize, sample_unitrange.sampsize)
+        @test isequal(sample.strata, sample_unitrange.strata)
+        @test isequal(sample.weights, sample_unitrange.weights)
+        @test isequal(sample.allprobs, sample_unitrange.allprobs)
+        @test isequal(sample.pps, sample_unitrange.pps)
+        @test isequal(sample.replicates, sample_unitrange.replicates)
+        @test isequal(sample.replicate_weights, sample_unitrange.replicate_weights)
+    end
+end
+
+@testset "ReplicateDesign_regex" begin
+    for (sample, sample_regex) in [(bsrs, bsrs_regex), (bstrat, bstrat_regex), (dclus1_boot, dclus1_boot_regex)]
+        @test isequal(sample.data, sample_regex.data)
+        @test isequal(sample.popsize, sample_regex.popsize)
+        @test isequal(sample.sampsize, sample_regex.sampsize)
+        @test isequal(sample.strata, sample_regex.strata)
+        @test isequal(sample.weights, sample_regex.weights)
+        @test isequal(sample.allprobs, sample_regex.allprobs)
+        @test isequal(sample.pps, sample_regex.pps)
+        @test isequal(sample.replicates, sample_regex.replicates)
+        @test isequal(sample.replicate_weights, sample_regex.replicate_weights)
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0485dbaa..61d6637b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,26 +4,37 @@ using CategoricalArrays
 
 const STAT_TOL = 1e-5
 const SE_TOL = 1e-1
-const REPLICATE_WEIGHTS = [Symbol("replicate_"*string(i)) for i in 1:4000]
+TOTAL_REPLICATES = 4000
+REPLICATES_VECTOR = [Symbol("replicate_"*string(i)) for i in 1:TOTAL_REPLICATES]
+REPLICATES_REGEX = r"r*_\d"
 
 # Simple random sample
 apisrs = load_data("apisrs") # Load API dataset
 srs = SurveyDesign(apisrs, weights = :pw)
+unitrange = UnitRange((length(names(apisrs)) + 1):(TOTAL_REPLICATES + length(names(apisrs))))
 bsrs = srs |> bootweights # Create replicate design
-bsrs_direct = ReplicateDesign(bsrs.data, REPLICATE_WEIGHTS, weights = :pw)  # using ReplicateDesign constructor 
+bsrs_direct = ReplicateDesign(bsrs.data, REPLICATES_VECTOR, weights = :pw)  # using ReplicateDesign constructor
+bsrs_unitrange = ReplicateDesign(bsrs.data, unitrange, weights = :pw)  # using ReplicateDesign constructor
+bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw)  # using ReplicateDesign constructor
 
 # Stratified sample
 apistrat = load_data("apistrat") # Load API dataset
 dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign
+unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat))))
 bstrat = dstrat |> bootweights # Create replicate design
-bstrat_direct = ReplicateDesign(bstrat.data, REPLICATE_WEIGHTS, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
+bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
+bstrat_unitrange = ReplicateDesign(bstrat.data, unitrange, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
+bstrat_regex = ReplicateDesign(bstrat.data, REPLICATES_REGEX, strata=:stype, weights=:pw)  # using ReplicateDesign constructor
 
 # One-stage cluster sample
 apiclus1 = load_data("apiclus1") # Load API dataset
 apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake for pw column
 dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign
+unitrange = UnitRange((length(names(apiclus1)) + 1):(TOTAL_REPLICATES + length(names(apiclus1))))
 dclus1_boot = dclus1 |> bootweights # Create replicate design
-dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATE_WEIGHTS, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
+dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATES_VECTOR, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
+dclus1_boot_unitrange = ReplicateDesign(dclus1_boot.data, unitrange, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
+dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters=:dnum, weights=:pw)  # using ReplicateDesign constructor
 
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)

From 327dd14b7aa91f04252a1ffb6993017f2e02adeb Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Wed, 1 Mar 2023 00:48:23 +0530
Subject: [PATCH 28/63] Fixing doctests in `ReplicateDesign`.

---
 src/SurveyDesign.jl | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl
index 21904c0a..09d986a1 100644
--- a/src/SurveyDesign.jl
+++ b/src/SurveyDesign.jl
@@ -202,7 +202,7 @@ the above example, suppose we had the `bootstrat` data as a CSV file (for this e
 ```jldoctest replicate-design
 julia> using CSV;
 
-julia> DataFrames.rename!(bootstrat.data, ["replicate_"*string(index) => "r_"*string(index) for index in 1:1000])
+julia> DataFrames.rename!(bootstrat.data, ["replicate_"*string(index) => "r_"*string(index) for index in 1:1000]);
 
 julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data);
 
@@ -225,8 +225,30 @@ type: bootstrap
 replicates: 1000
 
 julia> bootstrat_unitrange = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw)
+ReplicateDesign:
+data: 200×1044 DataFrame
+strata: stype
+    [E, E, E  …  H]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  15.1]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
+replicates: 1000
 
 julia> bootstrat_regex = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw)
+ReplicateDesign:
+data: 200×1044 DataFrame
+strata: stype
+    [E, E, E  …  H]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  755.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  15.1]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0662]
+type: bootstrap
+replicates: 1000
 
 ```
 

From 36f6dc8dde0a6742dcba12a223ff344f6f413c55 Mon Sep 17 00:00:00 2001
From: Debartha Paul <pauldeep102@gmail.com>
Date: Tue, 7 Mar 2023 12:31:57 +0530
Subject: [PATCH 29/63] Added tests for ratio

Tests for ratio estimation of stratified sampling
---
 test/ratio.jl    | 3 +++
 test/runtests.jl | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/test/ratio.jl b/test/ratio.jl
index d0f5f975..f3326e1e 100644
--- a/test/ratio.jl
+++ b/test/ratio.jl
@@ -2,4 +2,7 @@
     @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4
     @test ratio(:api00, :enroll, dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1
     @test ratio(:api00, :enroll, dclus1_boot).ratio[1] ≈ 1.17182 atol = 1e-4
+    @test ratio(:api00, :enroll, dstrat).ratio[1] ≈ 1.11256 atol = 1e-4
+    @test ratio(:api00, :enroll, dstrat_boot).SE[1] ≈ 0.04185 atol = 1e-1
+    @test ratio(:api00, :enroll, dstrat_boot).ratio[1] ≈ 1.11256 atol = 1e-4
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 5f6b748b..064c3b1c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -42,6 +42,11 @@ apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake fo
 dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus2_boot = dclus2 |> bootweights # Create replicate design
 
+# Stratified sample
+apistrat = load_data("apistrat") # Load API dataset
+dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw, popsize = :fpc) # Create SurveyDesign
+dstrat_boot = dstrat |> bootweights # Create replicate design
+
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)
     @test size(load_data("apiclus2")) == (126, 41)

From 26a1e912fa663fc9dfd1bf2832b24e7065e3dde6 Mon Sep 17 00:00:00 2001
From: Debartha Paul <pauldeep102@gmail.com>
Date: Wed, 8 Mar 2023 15:40:15 +0530
Subject: [PATCH 30/63] Modified stratified design

Removed finite population correction in `dstrat`
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 064c3b1c..1da4a1a5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -44,7 +44,7 @@ dclus2_boot = dclus2 |> bootweights # Create replicate design
 
 # Stratified sample
 apistrat = load_data("apistrat") # Load API dataset
-dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw, popsize = :fpc) # Create SurveyDesign
+dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw) # Create SurveyDesign
 dstrat_boot = dstrat |> bootweights # Create replicate design
 
 @testset "Survey.jl" begin

From 46268021320e6000c2b7765c81a1b8e67d4d68f9 Mon Sep 17 00:00:00 2001
From: Debartha Paul <pauldeep102@gmail.com>
Date: Fri, 10 Mar 2023 08:30:10 +0530
Subject: [PATCH 31/63] Update test/runtests.jl

Remove recurring definitions for `apistrat`

Co-authored-by: Ayush Patnaik <u6012645@anu.edu.au>
---
 test/runtests.jl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 1da4a1a5..5f6b748b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -42,11 +42,6 @@ apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake fo
 dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus2_boot = dclus2 |> bootweights # Create replicate design
 
-# Stratified sample
-apistrat = load_data("apistrat") # Load API dataset
-dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw) # Create SurveyDesign
-dstrat_boot = dstrat |> bootweights # Create replicate design
-
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)
     @test size(load_data("apiclus2")) == (126, 41)

From 424bde1c24e92090a058bbebed6a4ec1b6e4a459 Mon Sep 17 00:00:00 2001
From: Debartha Paul <pauldeep102@gmail.com>
Date: Fri, 10 Mar 2023 08:55:09 +0530
Subject: [PATCH 32/63] Added replicate design for stratified

---
 test/runtests.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/runtests.jl b/test/runtests.jl
index 5f6b748b..e06effc7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -20,6 +20,7 @@ bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw)  # usin
 # Stratified sample
 apistrat = load_data("apistrat") # Load API dataset
 dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign
+dstrat_boot = dstrat |> bootweights # Create replicate design
 unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat))))
 bstrat = dstrat |> bootweights # Create replicate design
 bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw)  # using ReplicateDesign constructor

From 4c2dd2146f12615cc4057dfc6c2d702cb1ffb026 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 10 Mar 2023 15:27:52 +0530
Subject: [PATCH 33/63] Adding `jackknifeweights` implementation.

---
 src/jackknife.jl | 183 +++++++++++++++++++++++++++++------------------
 1 file changed, 115 insertions(+), 68 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 9b97c076..2607a0a5 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -1,37 +1,37 @@
-"""
-    Delete 1 jackknife with  for unstratified designs.
+# """
+#     Delete 1 jackknife with  for unstratified designs.
     
-    Replicate weights are given by:
-    wi(hj)  = 0, if observation unit i is in psu j of stratum h
-            = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
-    ## Reference
-    pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
-"""
-function jk1weights(design::SurveyDesign)
-    cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting?
-    psus = unique(cluster_sorted[!, design.cluster])
-    nh = length(psus)
-    npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
-    replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
-    cluster_weights = cluster_sorted[!, design.weights]
-    for (i,replicate) in enumerate(replicates)
-        cluster_sorted[!, "replicate_"*string(i)] = vcat(
-            [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
-            ]...,
-        ) .* cluster_weights
-    end
-    return ReplicateDesign(
-        cluster_sorted,
-        design.cluster,
-        design.popsize,
-        design.sampsize,
-        design.strata,
-        design.weights,
-        design.allprobs,
-        design.pps,
-        length(replicates),
-    )
-end
+#     Replicate weights are given by:
+#     wi(hj)  = 0, if observation unit i is in psu j of stratum h
+#             = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
+#     ## Reference
+#     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+# """
+# function jk1weights(design::SurveyDesign)
+#     cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting?
+#     psus = unique(cluster_sorted[!, design.cluster])
+#     nh = length(psus)
+#     npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
+#     replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
+#     cluster_weights = cluster_sorted[!, design.weights]
+#     for (i,replicate) in enumerate(replicates)
+#         cluster_sorted[!, "replicate_"*string(i)] = vcat(
+#             [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
+#             ]...,
+#         ) .* cluster_weights
+#     end
+#     return ReplicateDesign(
+#         cluster_sorted,
+#         design.cluster,
+#         design.popsize,
+#         design.sampsize,
+#         design.strata,
+#         design.weights,
+#         design.allprobs,
+#         design.pps,
+#         length(replicates),
+#     )
+# end
 
 """
     Jackknife estimator
@@ -49,53 +49,100 @@ function JackknifeEstimator(θ̂::Float64,design::ReplicateDesign)
     return calc_df
 end
 
-"""
-    WIP: Delete-1 jackknife algorithm for replicate weights columns
+# """
+#     WIP: Delete-1 jackknife algorithm for replicate weights columns
     
-    Replicate weights are given by:
-    wi(hj)  = wi, if observation unit i is not in stratum h
-            = 0, if observation unit i is in psu j of stratum h
-            = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
+#     Replicate weights are given by:
+#     wi(hj)  = wi, if observation unit i is not in stratum h
+#             = 0, if observation unit i is in psu j of stratum h
+#             = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
+
+#     ## Reference
+#     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+# """
+# function jackknifeweights(design::SurveyDesign)
+#     df = design.data
+#     ## Find number of psus (nh) in each strata, used inside loop
+#     stratified_gdf = groupby(df, design.strata)
+#     nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf)
+#     nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) 
+    
+#     # Iterate over each combination of strata X cluster
+#     unique_strata_cols_df = unique(select(df, design.strata, design.cluster))
+#     counter = 1
+#     wᵢ = df[:,design.weights]
+#     for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2])
+#         colname = "replicate_"*string(counter)
+#         @show colname, strata, psu
+
+#         # Initialise replicate_i with original sampling weights
+#         df[!,colname] = wᵢ
+
+#         ###### three mutually exclusive cases based on strata and psu
+
+#         ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata)
+#         # not_in_strata = df[df[!,design.strata] .!= strata,:]
+#         # Keep replicate weight at these indices to: wi  
+
+#         ### if observation unit i is in psu j of stratum h
+#         # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu)
+#         in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:]
+#         # Set replicate weight at these indices to: 0 
+        
+#         ### if observation unit i is in stratum h but not in psu j.
+#         in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:]
+#         # Set replicate weight at these indices to: nh/(nh-1) * wi
+        
+#         counter += 1
+#     end
+#     return
+# end
 
-    ## Reference
-    pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
-"""
 function jackknifeweights(design::SurveyDesign)
     df = design.data
-    ## Find number of psus (nh) in each strata, used inside loop
+
+    # Find number of psus (nh) in each strata, used inside loop
     stratified_gdf = groupby(df, design.strata)
-    nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf)
-    nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) 
-    
-    # Iterate over each combination of strata X cluster
+    nh = Dict{String, Int}()
+    for (key, subgroup) in pairs(stratified_gdf)
+        nh[key[design.strata]] = length(unique(subgroup[!, design.cluster]))
+    end
+
+    # iterating over each unique combinations of strata and cluster
     unique_strata_cols_df = unique(select(df, design.strata, design.cluster))
-    counter = 1
-    wᵢ = df[:,design.weights]
-    for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2])
-        colname = "replicate_"*string(counter)
-        @show colname, strata, psu
+    replicate_index = 1
+    for row in eachrow(unique_strata_cols_df)
+        stratum = row[design.strata]
+        psu = row[design.cluster]
+        colname = "replicate_"*string(replicate_index)
 
         # Initialise replicate_i with original sampling weights
-        df[!,colname] = wᵢ
+        df[!, colname] = Vector(df[!, design.weights])
 
-        ###### three mutually exclusive cases based on strata and psu
+        # getting indexes
+        same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu)
+        different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu)
 
-        ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata)
-        # not_in_strata = df[df[!,design.strata] .!= strata,:]
-        # Keep replicate weight at these indices to: wi  
+        # scaling weights appropriately
+        df[same_psu, colname] .*= 0
+        df[different_psu, colname] .*= nh[stratum]/(nh[stratum] - 1)
 
-        ### if observation unit i is in psu j of stratum h
-        # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu)
-        in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:]
-        # Set replicate weight at these indices to: 0 
-        
-        ### if observation unit i is in stratum h but not in psu j.
-        in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:]
-        # Set replicate weight at these indices to: nh/(nh-1) * wi
-        
-        counter += 1
+        replicate_index += 1
     end
-    return
+
+    return ReplicateDesign(
+        df,
+        design.cluster,
+        design.popsize,
+        design.sampsize,
+        design.strata,
+        design.weights,
+        design.allprobs,
+        design.pps,
+        "jackknife",
+        UInt(DataFrames.nrow(unique_strata_cols_df)),
+        [Symbol("replicate_"*string(index)) for index in 1:DataFrames.nrow(unique_strata_cols_df)]
+    )
 end
 
     # @show unique_strata_cols_df

From 876fa7fd62b2ca52d0f89404f7f5c60d39aa7550 Mon Sep 17 00:00:00 2001
From: Shikhar Mishra <sm_data@outlook.com>
Date: Tue, 14 Mar 2023 13:41:46 +0530
Subject: [PATCH 34/63] Update test/runtests.jl

Co-authored-by: Ayush Patnaik <u6012645@anu.edu.au>
---
 test/runtests.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index e06effc7..5f6b748b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -20,7 +20,6 @@ bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw)  # usin
 # Stratified sample
 apistrat = load_data("apistrat") # Load API dataset
 dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign
-dstrat_boot = dstrat |> bootweights # Create replicate design
 unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat))))
 bstrat = dstrat |> bootweights # Create replicate design
 bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw)  # using ReplicateDesign constructor

From 13e358a54127875ac686ac8d17dfa2142d4accc4 Mon Sep 17 00:00:00 2001
From: Shikhar Mishra <sm_data@outlook.com>
Date: Tue, 14 Mar 2023 13:42:35 +0530
Subject: [PATCH 35/63] Update ratio.jl

change to bstrat
---
 test/ratio.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/ratio.jl b/test/ratio.jl
index f3326e1e..3536a169 100644
--- a/test/ratio.jl
+++ b/test/ratio.jl
@@ -2,7 +2,7 @@
     @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4
     @test ratio(:api00, :enroll, dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1
     @test ratio(:api00, :enroll, dclus1_boot).ratio[1] ≈ 1.17182 atol = 1e-4
-    @test ratio(:api00, :enroll, dstrat).ratio[1] ≈ 1.11256 atol = 1e-4
-    @test ratio(:api00, :enroll, dstrat_boot).SE[1] ≈ 0.04185 atol = 1e-1
-    @test ratio(:api00, :enroll, dstrat_boot).ratio[1] ≈ 1.11256 atol = 1e-4
+    @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4
+    @test ratio(:api00, :enroll, bstrat).SE[1] ≈ 0.04185 atol = 1e-1
+    @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4
 end

From 246ff130be637549eb2f7b8f68abd3c2edd683f7 Mon Sep 17 00:00:00 2001
From: Shikhar Mishra <sm_data@outlook.com>
Date: Tue, 14 Mar 2023 13:47:58 +0530
Subject: [PATCH 36/63] Update ratio.jl

combining #279 into this PR
---
 test/ratio.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/ratio.jl b/test/ratio.jl
index 3536a169..b9865f09 100644
--- a/test/ratio.jl
+++ b/test/ratio.jl
@@ -1,8 +1,14 @@
 @testset "ratio.jl" begin
+    # on :api00
+    @test ratio(:api00, :enroll, srs).ratio[1] ≈ 1.1231 atol = 1e-4
     @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4
     @test ratio(:api00, :enroll, dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1
     @test ratio(:api00, :enroll, dclus1_boot).ratio[1] ≈ 1.17182 atol = 1e-4
     @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4
     @test ratio(:api00, :enroll, bstrat).SE[1] ≈ 0.04185 atol = 1e-1
     @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4
+    # on :api99
+    @test ratio(:api99, :enroll, bsrs).ratio[1] ≈ 1.06854 atol = 1e-4
+    @test ratio(:api99, :enroll, dstrat).ratio[1] ≈ 1.0573 atol = 1e-4
+    @test ratio(:api99, :enroll, bstrat).ratio[1] ≈ 1.0573 atol = 1e-4
 end

From 02136a551883dd69e146cdc0506952957a3d1fd6 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Tue, 14 Mar 2023 14:41:59 +0530
Subject: [PATCH 37/63] remove commented code jkknife. rm jk1 only function

---
 src/jackknife.jl | 171 +++--------------------------------------------
 1 file changed, 9 insertions(+), 162 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 2607a0a5..1c15d676 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -1,103 +1,14 @@
-# """
-#     Delete 1 jackknife with  for unstratified designs.
-    
-#     Replicate weights are given by:
-#     wi(hj)  = 0, if observation unit i is in psu j of stratum h
-#             = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
-#     ## Reference
-#     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
-# """
-# function jk1weights(design::SurveyDesign)
-#     cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting?
-#     psus = unique(cluster_sorted[!, design.cluster])
-#     nh = length(psus)
-#     npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
-#     replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
-#     cluster_weights = cluster_sorted[!, design.weights]
-#     for (i,replicate) in enumerate(replicates)
-#         cluster_sorted[!, "replicate_"*string(i)] = vcat(
-#             [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
-#             ]...,
-#         ) .* cluster_weights
-#     end
-#     return ReplicateDesign(
-#         cluster_sorted,
-#         design.cluster,
-#         design.popsize,
-#         design.sampsize,
-#         design.strata,
-#         design.weights,
-#         design.allprobs,
-#         design.pps,
-#         length(replicates),
-#     )
-# end
-
-"""
-    Jackknife estimator
-
-    Uses jackknife replicate weights to calculate variance of estimator θ̂
 """
-function JackknifeEstimator(θ̂::Float64,design::ReplicateDesign)
-    gdf = groupby(design.data,design.strata)
-    # @show keys(gdf)
-    calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf)
-    calc_df[!,:multiplier] = (calc_df[!,:x1] .- 1 ) ./ calc_df[!,:x1]
-    # for each_gdf in gdf
-    #     @show nrow(each_gdf)
-    # end
-    return calc_df
-end
-
-# """
-#     WIP: Delete-1 jackknife algorithm for replicate weights columns
-    
-#     Replicate weights are given by:
-#     wi(hj)  = wi, if observation unit i is not in stratum h
-#             = 0, if observation unit i is in psu j of stratum h
-#             = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
-
-#     ## Reference
-#     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
-# """
-# function jackknifeweights(design::SurveyDesign)
-#     df = design.data
-#     ## Find number of psus (nh) in each strata, used inside loop
-#     stratified_gdf = groupby(df, design.strata)
-#     nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf)
-#     nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) 
+    Delete-1 Jackknife algorithm for replication weights from sampling weights
     
-#     # Iterate over each combination of strata X cluster
-#     unique_strata_cols_df = unique(select(df, design.strata, design.cluster))
-#     counter = 1
-#     wᵢ = df[:,design.weights]
-#     for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2])
-#         colname = "replicate_"*string(counter)
-#         @show colname, strata, psu
-
-#         # Initialise replicate_i with original sampling weights
-#         df[!,colname] = wᵢ
-
-#         ###### three mutually exclusive cases based on strata and psu
-
-#         ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata)
-#         # not_in_strata = df[df[!,design.strata] .!= strata,:]
-#         # Keep replicate weight at these indices to: wi  
-
-#         ### if observation unit i is in psu j of stratum h
-#         # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu)
-#         in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:]
-#         # Set replicate weight at these indices to: 0 
-        
-#         ### if observation unit i is in stratum h but not in psu j.
-#         in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:]
-#         # Set replicate weight at these indices to: nh/(nh-1) * wi
-        
-#         counter += 1
-#     end
-#     return
-# end
+    Replicate weights are given by:
+    wi(hj)  = wi, if observation unit i is not in stratum h
+            = 0, if observation unit i is in psu j of stratum h
+            = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
 
+    ## Reference
+    pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+"""
 function jackknifeweights(design::SurveyDesign)
     df = design.data
 
@@ -143,68 +54,4 @@ function jackknifeweights(design::SurveyDesign)
         UInt(DataFrames.nrow(unique_strata_cols_df)),
         [Symbol("replicate_"*string(index)) for index in 1:DataFrames.nrow(unique_strata_cols_df)]
     )
-end
-
-    # @show unique_strata_cols_df
-    # stratas = unique(df[!,design.strata])
-    # psus = unique(df[!,design.cluster])
-        # for i in 1:I
-            
-        #     if design.data[i,design.weights]
-        #     design.data[i,colname] =  
-        # end
-# sort!(design.data, (design.strata,design.cluster))
-    # stratified = groupby(design.data, design.strata)
-    # H = length(keys(stratified))
-    # @show H
-    # calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf).x1
-    # substrata_dfs = DataFrame[]
-    # for h in 1:H
-    #     @show h
-    #     substrata = DataFrame(stratified[h])
-    #     cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting?
-    #     psus = unique(cluster_sorted[!, design.cluster])
-    #     # cluster_weights = substrata[!, design.weights]
-    #     # gdf = groupby(substrata, design.cluster)
-    #     # psus = unique(substrata[!, design.cluster])
-    #     nh = length(psus)
-    #     npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]       
-    #     replicates = [filter(n -> n != i, 1:nh) for i in 1:nh]
-    #     @show nh, psus, npsus
-    #     # @show replicates
-    #     cluster_weights = cluster_sorted[!, design.weights]
-    #     @show size(cluster_weights)
-    #     for (i,replicate) in enumerate(replicates)
-    #         tmp = vcat(
-    #                 [   fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh
-    #                 ]...,
-    #             ) .* cluster_weights
-    #         @show tmp
-    #         cluster_sorted[!, "replicate_"*string(i)] = tmp
-            
-    #     end
-    #     push!(substrata_dfs, cluster_sorted)
-    #     return vcat(substrata_dfs...)
-    # end
-
-        # push!(cluster_weights, [(count(==(i), replicate)) for i in 1:nh].*(nh/(nh-1)) )# main jackknife algo. 
-        # for j in 1:nh 
-        #     clusterweight = clusterweights[j]
-        #     for i in 1:nh 
-        #         gdf[i][!,"replicate_"*string(j)] = clusterweight[i].*DataFrame(gdf[i]).weights
-        #     end
-        # end
-        # stratified[h].replicate_1 = DataFrame(gdf).replicate_1
-    
-    # df = vcat(substrata_dfs...)
-    # return ReplicateDesign(
-    #     df,
-    #     design.cluster,
-    #     design.popsize,
-    #     design.sampsize,
-    #     design.strata,
-    #     design.weights,
-    #     design.allprobs,
-    #     design.pps,
-    #     replicates,
-    # )
+end
\ No newline at end of file

From 22338c2607d944adb29308d614407e5405f3ae2f Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Tue, 14 Mar 2023 14:43:55 +0530
Subject: [PATCH 38/63] rm exports

---
 src/Survey.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Survey.jl b/src/Survey.jl
index 91a0eb81..1be5b203 100644
--- a/src/Survey.jl
+++ b/src/Survey.jl
@@ -36,6 +36,6 @@ export hist, sturges, freedman_diaconis
 export boxplot
 export bootweights
 export ratio
-export jk1weights, JackknifeEstimator, jackknifeweights
+export jackknifeweights
 
 end

From 6aa8466c36f418e60a906da722758fac449d13b3 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Tue, 14 Mar 2023 16:20:34 +0530
Subject: [PATCH 39/63] Sorting Replicatedesign jackknife for now

---
 src/jackknife.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 1c15d676..eefde9d6 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -10,6 +10,7 @@
     pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
 """
 function jackknifeweights(design::SurveyDesign)
+    sort!(design.data, [design.strata, design.cluster])
     df = design.data
 
     # Find number of psus (nh) in each strata, used inside loop

From 3ab7ccca8b8865c815e7c5e30aec40c8ba22ae41 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Tue, 14 Mar 2023 16:21:29 +0530
Subject: [PATCH 40/63] Modify jk mean. Start jk tests

---
 src/mean.jl       | 52 ++++++++++++++++++++++++++++++++++++++++-------
 test/jackknife.jl | 11 +++++-----
 test/runtests.jl  |  2 +-
 3 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/src/mean.jl b/src/mean.jl
index 4db9fed1..d13eb116 100644
--- a/src/mean.jl
+++ b/src/mean.jl
@@ -46,13 +46,51 @@ julia> mean(:api00, bclus1)
 ```
 """
 function mean(x::Symbol, design::ReplicateDesign)
-    X = mean(design.data[!, x], weights(design.data[!, design.weights]))
-    Xt = [
-        mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
-        i = 1:design.replicates
-    ]
-    variance = sum((Xt .- X) .^ 2) / design.replicates
-    DataFrame(mean = X, SE = sqrt(variance))
+    if design.type == "bootstrap"
+        θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights]))
+        θ̂t = [
+            mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
+            i = 1:design.replicates
+        ]
+        variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
+    
+    elseif design.type == "jackknife"
+        
+        # The estimate based on original sampling weight vector
+        θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights]))
+        
+        # Find number of psus (nh) in each strata, used inside loop
+        stratified_gdf = groupby(design.data, design.strata)
+        nh = Dict{String, Int}()
+        for (key, subgroup) in pairs(stratified_gdf)
+            nh[key[design.strata]] = length(unique(subgroup[!, design.cluster]))
+        end
+        
+        # iterating over each unique combinations of strata and cluster
+        unique_strata_cols_df = unique(select(design.data, design.strata, design.cluster))
+        variance = 0
+        replicate_index = 1
+        SSE_θ̂hj = 0
+        previous_strata = first(unique_strata_cols_df[!,design.strata])
+        for row in eachrow(unique_strata_cols_df)
+            stratum = row[design.strata]
+            # psu = row[design.cluster]
+            colname = "replicate_"*string(replicate_index)
+            
+            if stratum != previous_strata
+                variance += ((nh[stratum] - 1) / nh[stratum]) * SSE_θ̂hj
+                previous_strata = stratum
+
+                θ̂j = mean(design.data[!, x], weights(design.data[!, colname]))
+                SSE_θ̂hj = (θ̂j - θ̂)^2 
+            else
+                θ̂j = mean(design.data[!, x], weights(design.data[!, colname]))
+                SSE_θ̂hj += (θ̂j - θ̂)^2 
+            end
+            replicate_index += 1
+        end
+    end
+    DataFrame(mean = θ̂, SE = sqrt(variance))
 end
 
 """
diff --git a/test/jackknife.jl b/test/jackknife.jl
index b33bd25b..c6527ac8 100644
--- a/test/jackknife.jl
+++ b/test/jackknife.jl
@@ -1,6 +1,7 @@
 @testset "jackknife" begin
-    dclus1_jack = dnhanes |> jackknifeweights
-    dclus1_jk1 = dclus1 |> jk1weights # Create replicate design
-end
-dclus1_jk1 = dclus1 |> jk1weights
-dclus1_jack = dnhanes |> jackknifeweights
\ No newline at end of file
+    dstrat_jk = jackknifeweights(dstrat)
+    mean(:api00,dstrat_jk)
+    mean(:api99,dstrat_jk)
+    # dnhanes_jk = jackknifeweights(dnhanes)
+    # dclus1_jk = jackknifeweights(dclus1)
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 31f0fa84..bf3acbf3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -62,4 +62,4 @@ include("hist.jl")
 include("boxplot.jl")
 include("ratio.jl")
 include("show.jl")
-# include("jackknife.jl")
\ No newline at end of file
+include("jackknife.jl")
\ No newline at end of file

From 6071a43b9d6a3756ae9697b2cc829ddb3de36d0f Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Fri, 17 Mar 2023 22:37:18 +0530
Subject: [PATCH 41/63] Adding `jackknife_variance` implementation. Also
 shortening code for `jackknifeweights`.

---
 src/Survey.jl    |  2 +-
 src/jackknife.jl | 74 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/src/Survey.jl b/src/Survey.jl
index 1be5b203..6960cfb5 100644
--- a/src/Survey.jl
+++ b/src/Survey.jl
@@ -36,6 +36,6 @@ export hist, sturges, freedman_diaconis
 export boxplot
 export bootweights
 export ratio
-export jackknifeweights
+export jackknifeweights, jackknife_variance
 
 end
diff --git a/src/jackknife.jl b/src/jackknife.jl
index eefde9d6..c9e90be2 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -13,33 +13,28 @@ function jackknifeweights(design::SurveyDesign)
     sort!(design.data, [design.strata, design.cluster])
     df = design.data
 
-    # Find number of psus (nh) in each strata, used inside loop
     stratified_gdf = groupby(df, design.strata)
-    nh = Dict{String, Int}()
+    replicate_index = 0
     for (key, subgroup) in pairs(stratified_gdf)
-        nh[key[design.strata]] = length(unique(subgroup[!, design.cluster]))
-    end
-
-    # iterating over each unique combinations of strata and cluster
-    unique_strata_cols_df = unique(select(df, design.strata, design.cluster))
-    replicate_index = 1
-    for row in eachrow(unique_strata_cols_df)
-        stratum = row[design.strata]
-        psu = row[design.cluster]
-        colname = "replicate_"*string(replicate_index)
+        stratum = key[design.strata]    # current stratum
+        psus_in_stratum = unique(subgroup[!, design.cluster])
+        nh = length(psus_in_stratum)
 
-        # Initialise replicate_i with original sampling weights
-        df[!, colname] = Vector(df[!, design.weights])
+        for psu in psus_in_stratum
+            replicate_index += 1
+            colname = "replicate_"*string(replicate_index)
 
-        # getting indexes
-        same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu)
-        different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu)
+            # Initialise replicate_i with original sampling weights
+            df[!, colname] = Vector(df[!, design.weights])
 
-        # scaling weights appropriately
-        df[same_psu, colname] .*= 0
-        df[different_psu, colname] .*= nh[stratum]/(nh[stratum] - 1)
+            # getting indexes
+            same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu)
+            different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu)
 
-        replicate_index += 1
+            # scaling weights appropriately
+            df[same_psu, colname] .*= 0
+            df[different_psu, colname] .*= nh/(nh - 1)
+        end
     end
 
     return ReplicateDesign(
@@ -52,7 +47,38 @@ function jackknifeweights(design::SurveyDesign)
         design.allprobs,
         design.pps,
         "jackknife",
-        UInt(DataFrames.nrow(unique_strata_cols_df)),
-        [Symbol("replicate_"*string(index)) for index in 1:DataFrames.nrow(unique_strata_cols_df)]
+        UInt(replicate_index),
+        [Symbol("replicate_"*string(index)) for index in 1:replicate_index]
     )
-end
\ No newline at end of file
+end
+
+function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
+    df = design.data
+    # sort!(df, [design.strata, design.cluster])
+    stratified_gdf = groupby(df, design.strata)
+
+    # estimator from original weights
+    θ = func(df[!, x], df[!, design.weights])
+
+    variance = 0
+    replicate_index = 1
+    for subgroup in stratified_gdf
+        psus_in_stratum = unique(subgroup[!, design.cluster])
+        nh = length(psus_in_stratum)
+        cluster_variance = 0
+        for psu in psus_in_stratum
+            # get replicate weights corresponding to current stratum and psu
+            rep_weights = design.data[!, "replicate_"*string(replicate_index)]
+
+            # estimator from replicate weights
+            θhj = func(design.data[!, x], rep_weights)
+
+            cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2
+            replicate_index += 1
+        end
+        variance += cluster_variance
+    end
+
+    return DataFrame(estimator = θ, SE = sqrt(variance))
+end
+

From 4f49122b7c955f9d631e63c3674210d30b501d00 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Sat, 18 Mar 2023 01:09:47 +0530
Subject: [PATCH 42/63] Adding preliminary docstrings for `jackknifeweights`
 and `jackknife_variance`.

---
 docs/src/api.md  |  2 ++
 src/jackknife.jl | 37 ++++++++++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/docs/src/api.md b/docs/src/api.md
index 341dcbba..e3dd4f08 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -14,6 +14,8 @@ SurveyDesign
 ReplicateDesign
 load_data
 bootweights
+jackknifeweights
+jackknife_variance
 mean
 total
 quantile
diff --git a/src/jackknife.jl b/src/jackknife.jl
index c9e90be2..5f0e5428 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -1,13 +1,21 @@
 """
-    Delete-1 Jackknife algorithm for replication weights from sampling weights
-    
-    Replicate weights are given by:
-    wi(hj)  = wi, if observation unit i is not in stratum h
-            = 0, if observation unit i is in psu j of stratum h
-            = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j.
-
-    ## Reference
-    pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+```julia
+jackknifeweights(design::SurveyDesign)
+```
+Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula.
+
+```math
+w_{i(hj)} =
+\\begin{cases}
+    w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\
+    0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\
+    \\dfrac{n_h}{n_h - 1}w_i &\\quad\\quad\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\
+\\end{cases}
+```
+The replicate weight columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. These are added to the underlying `design.data` `DataFrame`.
+
+# Reference
+pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
 """
 function jackknifeweights(design::SurveyDesign)
     sort!(design.data, [design.strata, design.cluster])
@@ -52,6 +60,17 @@ function jackknifeweights(design::SurveyDesign)
     )
 end
 
+"""
+```julia
+jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
+```
+
+Compute variance for the given `func` using the Jackknife method. The formula to compute this variance is the following.
+
+```math
+\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2
+```
+"""
 function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
     df = design.data
     # sort!(df, [design.strata, design.cluster])

From 10e7fb8355eefb85a5411fedc1c92afb9796546f Mon Sep 17 00:00:00 2001
From: nadiaenh <nadia.enhaili@gmail.com>
Date: Sun, 19 Mar 2023 02:58:20 -0700
Subject: [PATCH 43/63] added instructions for WSL + table of contents

---
 CONTRIBUTING.md | 51 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ca099c87..dd7aee97 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,6 +2,17 @@
 
 # Contributing to Survey.jl
 
+  * [Overview](#overview)
+  * [Reporting Issues](#reporting-issues)
+  * [Recommended workflow setup](#recommended-workflow-setup)
+  * [Modifying an existing docstring in `src/`](#modifying-an-existing-docstring-in--src--)
+  * [Adding a new docstring to `src/`](#adding-a-new-docstring-to--src--)
+  * [Doctests](#doctests)
+  * [Integration with exisiting API](#integration-with-exisiting-api)
+  * [Contributing](#contributing)
+  * [Style Guidelines](#style-guidelines)
+  * [Git Recommendations For Pull Requests](#git-recommendations-for-pull-requests)
+
 ## Overview
 Thank you for thinking about making contributions to Survey.jl!  
 We aim to keep consistency in contribution guidelines to DataFrames.jl, which is the main upstream dependency for the project. 
@@ -16,6 +27,46 @@ Reading through the ColPrac guide for collaborative practices is highly recommen
   (`Pkg.add(name="Survey", rev="main")`) is a good gut check and can streamline the process,
   along with including the first two lines of output from `versioninfo()`
 
+## Recommended workflow setup
+
+This tutorial assumes you will be using Windows Subsystem for Linux (WSL) and VSCode, which is recommended. 
+
+1. Install Ubuntu on WSL from the [Ubuntu website](https://ubuntu.com/wsl) or the Microsoft Store
+2. Create a fork of the [Survey.jl repository](https://github.com/xKDR/Survey.jl). You will only be ever working on this fork, and submitting Pull Requests to the main repo. 
+3. Copy the SSH link from your fork by clicking the green `<> Code` icon and then `SSH`. 
+    - You must already have SSH setup for this to work. If you don't, look up a tutorial on how to clone a github repository using SSH.
+4. Open a WSL terminal, and run :
+    - `curl -fsSL https://install.julialang.org | sh`
+    - `git clone git@github.com:your_username/Survey.jl.git` -- replace "*your_username**"
+    - `julia`
+3. You are now in the Julia REPL, run :
+    - `import Pkg; Pkg.add("Revise")`
+    - `import Pkg; Pkg.add("Survey")`
+    - `import Pkg; Pkg.add("Test")`
+    - `] dev .`
+4. Open VSCode and install the following extensions :
+    - WSL 
+    - Julia
+5. Go back to your WSL terminal, navigate to the folder of your repo, and run `code .` to open VSCode in that folder
+6. Create a `dev` folder, and a `test.jl` file. Paste this block of code and save :
+
+```julia
+using Revise, Survey, Test
+
+@testset "ratio.jl" begin
+    apiclus1 = load_data("apiclus1")
+    dclus1 = SurveyDesign(apiclus1; clusters=:dnum, strata=:stype, weights=:pw)
+    @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4
+end
+```
+
+9. In the WSL terminal (not Julia REPL), run `julia dev/test.jl`  
+✅ If you get no errors, your setup is now complete !
+
+You can keep working in the `dev` folder, which is .gitignored.  
+Once you have working code and tests, you can move them to the appropriate folders, commit, push, and submit a Pull Request.  
+Make sure to read the rest of this document so you can learn the best practices and guidelines for this project.  
+
 ## Modifying an existing docstring in `src/`
 
 All docstrings are written inline above the methods or types they are associated with and can

From ad8b1dc01ff6cb66544e4988a4438f0881b05d3d Mon Sep 17 00:00:00 2001
From: Shikhar Mishra <sm_data@outlook.com>
Date: Thu, 23 Mar 2023 11:24:26 +0530
Subject: [PATCH 44/63] Update CONTRIBUTING.md

---
 CONTRIBUTING.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index dd7aee97..ee5e1acf 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,9 +27,9 @@ Reading through the ColPrac guide for collaborative practices is highly recommen
   (`Pkg.add(name="Survey", rev="main")`) is a good gut check and can streamline the process,
   along with including the first two lines of output from `versioninfo()`
 
-## Recommended workflow setup
+## Setting up development workflow
 
-This tutorial assumes you will be using Windows Subsystem for Linux (WSL) and VSCode, which is recommended. 
+Below tutorial uses Windows Subsystem for Linux (WSL) and VSCode. Linux/MacOS/BSD can ignore WSL specific steps.
 
 1. Install Ubuntu on WSL from the [Ubuntu website](https://ubuntu.com/wsl) or the Microsoft Store
 2. Create a fork of the [Survey.jl repository](https://github.com/xKDR/Survey.jl). You will only be ever working on this fork, and submitting Pull Requests to the main repo. 
@@ -48,7 +48,7 @@ This tutorial assumes you will be using Windows Subsystem for Linux (WSL) and VS
     - WSL 
     - Julia
 5. Go back to your WSL terminal, navigate to the folder of your repo, and run `code .` to open VSCode in that folder
-6. Create a `dev` folder, and a `test.jl` file. Paste this block of code and save :
+6. Create a `dev` folder (only if you want, it is gitignored by default), and a `test.jl` file in the file. Paste this block of code and save :
 
 ```julia
 using Revise, Survey, Test
@@ -145,7 +145,7 @@ This way you are modifying as little as possible of previously written code, and
 * If you want to propose a new functionality it is strongly recommended to open an issue first and reach a decision on the final design.
   Then a pull request serves an implementation of the agreed way how things should work.
 * If you are a new contributor and would like to get a guidance on what area
-  you could focus your first PR please do not hesitate to ask and JuliaData members
+  you could focus your first PR please do not hesitate to ask community members
   will help you with picking a topic matching your experience.
 * Feel free to open, or comment on, an issue and solicit feedback early on,
   especially if you're unsure about aligning with design goals and direction,
@@ -155,8 +155,8 @@ This way you are modifying as little as possible of previously written code, and
 * Aim for atomic commits, if possible, e.g. `change 'foo' behavior like so` &
   `'bar' handles such and such corner case`,
   rather than `update 'foo' and 'bar'` & `fix typo` & `fix 'bar' better`.
-* Pull requests are tested against release and development branches of Julia,
-  so using `Pkg.test("DataFrames")` as you develop can be helpful.
+* Pull requests are tested against release branches of Julia,
+  so using `Pkg.test("Survey")` as you develop can be helpful.
 * The style guidelines outlined below are not the personal style of most contributors,
   but for consistency throughout the project, we've adopted them.
 * It is recommended to disable GitHub Actions on your fork; check Settings > Actions.

From 5f662ed93a6e04224b20a50f5f169c1dc4639e39 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Thu, 23 Mar 2023 11:52:16 +0530
Subject: [PATCH 45/63] integrate with mean

---
 src/mean.jl | 40 ++++------------------------------------
 1 file changed, 4 insertions(+), 36 deletions(-)

diff --git a/src/mean.jl b/src/mean.jl
index d13eb116..efde4efd 100644
--- a/src/mean.jl
+++ b/src/mean.jl
@@ -53,44 +53,12 @@ function mean(x::Symbol, design::ReplicateDesign)
             i = 1:design.replicates
         ]
         variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
-    
+        return DataFrame(mean = θ̂, SE = sqrt(variance))
+    # Jackknife integration
     elseif design.type == "jackknife"
-        
-        # The estimate based on original sampling weight vector
-        θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights]))
-        
-        # Find number of psus (nh) in each strata, used inside loop
-        stratified_gdf = groupby(design.data, design.strata)
-        nh = Dict{String, Int}()
-        for (key, subgroup) in pairs(stratified_gdf)
-            nh[key[design.strata]] = length(unique(subgroup[!, design.cluster]))
-        end
-        
-        # iterating over each unique combinations of strata and cluster
-        unique_strata_cols_df = unique(select(design.data, design.strata, design.cluster))
-        variance = 0
-        replicate_index = 1
-        SSE_θ̂hj = 0
-        previous_strata = first(unique_strata_cols_df[!,design.strata])
-        for row in eachrow(unique_strata_cols_df)
-            stratum = row[design.strata]
-            # psu = row[design.cluster]
-            colname = "replicate_"*string(replicate_index)
-            
-            if stratum != previous_strata
-                variance += ((nh[stratum] - 1) / nh[stratum]) * SSE_θ̂hj
-                previous_strata = stratum
-
-                θ̂j = mean(design.data[!, x], weights(design.data[!, colname]))
-                SSE_θ̂hj = (θ̂j - θ̂)^2 
-            else
-                θ̂j = mean(design.data[!, x], weights(design.data[!, colname]))
-                SSE_θ̂hj += (θ̂j - θ̂)^2 
-            end
-            replicate_index += 1
-        end
+        weightedmean(x, y) = mean(x, weights(y))
+        return jackknife_variance(x, weightedmean, design)
     end
-    DataFrame(mean = θ̂, SE = sqrt(variance))
 end
 
 """

From 0985ec1077fb274ef4f2f2ff5f7f58b08549a375 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Thu, 23 Mar 2023 12:34:03 +0530
Subject: [PATCH 46/63] added JK tests srs, strat

---
 test/jackknife.jl | 53 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/test/jackknife.jl b/test/jackknife.jl
index c6527ac8..ce7e685c 100644
--- a/test/jackknife.jl
+++ b/test/jackknife.jl
@@ -1,7 +1,48 @@
 @testset "jackknife" begin
-    dstrat_jk = jackknifeweights(dstrat)
-    mean(:api00,dstrat_jk)
-    mean(:api99,dstrat_jk)
-    # dnhanes_jk = jackknifeweights(dnhanes)
-    # dclus1_jk = jackknifeweights(dclus1)
-end
\ No newline at end of file
+    # Create Jackknife replicate designs
+    bsrs_jk = srs |> jackknifeweights # SRS
+    dstrat_jk = dstrat |> jackknifeweights # Stratified
+    dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster
+    dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified
+    
+    mean(:api00,bsrs_jk).SE[1] ≈ 9.4028 atol = 1e-3 
+    mean([:api99,:api00],dstrat_jk).SE[1] ≈ 10.097  atol = 1e-3 
+    # mean(:api00,dclus2_jk)
+    # mean(,dnhanes_jk)
+end
+
+# # R code for correctness above
+# library(survey) 
+# data(api)
+# apiclus1$pw = rep(757/15,nrow(apiclus1))
+
+# #############
+# ###### 23.03.22 PR#260
+
+# ## srs with fpc (doesnt match Julia)
+# srs <- svydesign(id=~1,data=apisrs, weights=~pw, fpc=~fpc)
+# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE)
+# svymean(~api00,bsrs_jk)
+# #         mean     SE
+# # api00 656.58 9.2497
+
+# ## srs without fpc (matches Julia)
+# srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc)
+# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE)
+# svymean(~api00,bsrs_jk)
+# #         mean     SE
+# # api00 656.58 9.4028
+
+# # strat with fpc (doesnt match Julia)
+# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc)
+# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE)
+# svymean(~api99,dstrat_jk)
+# #         mean     SE
+# # api99 629.39 9.9639
+
+# # strat without fpc (matches Julia)
+# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc)
+# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE)
+# svymean(~api99,dstrat_jk)
+# #         mean     SE
+# # api99 629.39 10.097
\ No newline at end of file

From a223253c77d1229b1c883c966d78f9f5951ecf69 Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Thu, 23 Mar 2023 12:43:54 +0530
Subject: [PATCH 47/63] improved formatting for srs strat tests

---
 test/jackknife.jl | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/test/jackknife.jl b/test/jackknife.jl
index ce7e685c..a61ae934 100644
--- a/test/jackknife.jl
+++ b/test/jackknife.jl
@@ -5,8 +5,14 @@
     dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster
     dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified
     
-    mean(:api00,bsrs_jk).SE[1] ≈ 9.4028 atol = 1e-3 
-    mean([:api99,:api00],dstrat_jk).SE[1] ≈ 10.097  atol = 1e-3 
+    mean_srs_jk = mean([:api00,:api99], bsrs_jk)
+    @test mean_srs_jk.SE[1] ≈ 9.4028 atol = 1e-3
+    @test mean_srs_jk.SE[2] ≈ 9.6575 atol = 1e-3
+    
+    mean_strat_jk = mean([:api00,:api99],dstrat_jk)
+    @test mean_strat_jk.SE[1] ≈ 9.5361 atol = 1e-3
+    @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3
+    
     # mean(:api00,dclus2_jk)
     # mean(,dnhanes_jk)
 end
@@ -29,9 +35,10 @@ end
 # ## srs without fpc (matches Julia)
 # srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc)
 # bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE)
-# svymean(~api00,bsrs_jk)
+# svymean(~api00+api99,bsrs_jk)
 # #         mean     SE
 # # api00 656.58 9.4028
+# # api99 624.68 9.6575
 
 # # strat with fpc (doesnt match Julia)
 # dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc)
@@ -43,6 +50,7 @@ end
 # # strat without fpc (matches Julia)
 # dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc)
 # dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE)
-# svymean(~api99,dstrat_jk)
+# svymean(~api00+api99,dstrat_jk)
 # #         mean     SE
-# # api99 629.39 10.097
\ No newline at end of file
+# # api00 662.29  9.5361
+# # api99 629.39 10.0970
\ No newline at end of file

From d326314043328bdb271b7e4679af5ebcaea7f85a Mon Sep 17 00:00:00 2001
From: smishr <sm_data@outlook.com>
Date: Thu, 23 Mar 2023 13:12:56 +0530
Subject: [PATCH 48/63] dclus2 tests not working

---
 test/jackknife.jl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/jackknife.jl b/test/jackknife.jl
index a61ae934..49ec0096 100644
--- a/test/jackknife.jl
+++ b/test/jackknife.jl
@@ -13,8 +13,11 @@
     @test mean_strat_jk.SE[1] ≈ 9.5361 atol = 1e-3
     @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3
     
-    # mean(:api00,dclus2_jk)
-    # mean(,dnhanes_jk)
+    mean_clus2_jk = mean([:api00,:api99],dclus2_jk)
+    # @test mean_clus2_jk.SE[1] ≈ XXXXX atol = 1e-3
+    # @test mean_clus2_jk.SE[2] ≈ XXXXX atol = 1e-3
+    
+    # Tests using for NHANES
 end
 
 # # R code for correctness above

From 4f278dac93c07ad58d0f993464008e3ca60a5afe Mon Sep 17 00:00:00 2001
From: Anshul Singhvi <anshulsinghvi@gmail.com>
Date: Fri, 24 Mar 2023 12:50:19 +0530
Subject: [PATCH 49/63] Add tests for total.jl

---
 test/total.jl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/total.jl b/test/total.jl
index b8d91795..f21cfa81 100644
--- a/test/total.jl
+++ b/test/total.jl
@@ -168,4 +168,13 @@ end
     # equivalent R code (results cause clutter):
     # > svyby(~api00, ~cname, clus1rep, svytotal)
     # > svyby(~api00, ~cname, clus1rep, svymean)
+    
+    # Test that column specifiers from DataFrames make it through this pipeline
+    # These tests replicate what you see above...just with a different syntax.
+    tot = total(:api00, Cols(==(:cname)), dclus1)
+    @test size(tot)[1] == apiclus1.cname |> unique |> length
+    @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL
+    @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL
+    @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL
+    @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL
 end

From 6079b09da0f7eba401a0cc3673fd8d16094d03e8 Mon Sep 17 00:00:00 2001
From: Jishnu Bhattacharya <jishnub.github@gmail.com>
Date: Fri, 24 Mar 2023 14:47:42 +0530
Subject: [PATCH 50/63] Reduce allocations in bootweights

---
 src/bootstrap.jl | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/bootstrap.jl b/src/bootstrap.jl
index 7e01247f..96e98367 100644
--- a/src/bootstrap.jl
+++ b/src/bootstrap.jl
@@ -24,27 +24,17 @@ replicates: 1000
 function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234))
     stratified = groupby(design.data, design.strata)
     H = length(keys(stratified))
-    substrata_dfs = DataFrame[]
+    substrata_dfs = Vector{DataFrame}(undef, H)
     for h = 1:H
         substrata = DataFrame(stratified[h])
         cluster_sorted = sort(substrata, design.cluster)
-        psus = unique(cluster_sorted[!, design.cluster])
-        npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus]
-        nh = length(psus)
+        cluster_sorted_designcluster = cluster_sorted[!, design.cluster]
         cluster_weights = cluster_sorted[!, design.weights]
-        for replicate = 1:replicates
-            randinds = rand(rng, 1:(nh), (nh - 1))
-            cluster_sorted[!, "replicate_"*string(replicate)] =
-                vcat(
-                    [
-                        fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for
-                        i = 1:nh
-                    ]...,
-                ) .* cluster_weights
-        end
-        push!(substrata_dfs, cluster_sorted)
+        _bootweights_cluster_sorted!(cluster_sorted, cluster_weights,
+            cluster_sorted_designcluster, replicates, rng)
+        substrata_dfs[h] = cluster_sorted
     end
-    df = vcat(substrata_dfs...)
+    df = reduce(vcat, substrata_dfs)
     return ReplicateDesign(
         df,
         design.cluster,
@@ -57,3 +47,22 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
         replicates,
     )
 end
+
+function _bootweights_cluster_sorted!(cluster_sorted,
+        cluster_weights, cluster_sorted_designcluster, replicates, rng)
+
+    psus = unique(cluster_sorted_designcluster)
+    npsus = [count(==(i), cluster_sorted_designcluster) for i in psus]
+    nh = length(psus)
+    for replicate = 1:replicates
+        randinds = rand(rng, 1:(nh), (nh - 1))
+        cluster_sorted[!, "replicate_"*string(replicate)] =
+            reduce(vcat,
+                [
+                    fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for
+                    i = 1:nh
+                ]
+            ) .* cluster_weights
+    end
+    cluster_sorted
+end

From b7ac4f2d92706033c60b9f465c84c133abb13ad9 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Tue, 21 Mar 2023 12:34:38 +0530
Subject: [PATCH 51/63] Adding information to docstring of `jackknifeweights`.

---
 src/jackknife.jl | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 5f0e5428..9473c0f5 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -9,10 +9,31 @@ w_{i(hj)} =
 \\begin{cases}
     w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\
     0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\
-    \\dfrac{n_h}{n_h - 1}w_i &\\quad\\quad\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\
+    \\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\
 \\end{cases}
 ```
-The replicate weight columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. These are added to the underlying `design.data` `DataFrame`.
+In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns.
+
+# Examples
+```jldoctest setup = :(using Survey)
+julia> apistrat = load_data("apistrat");
+
+julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+
+julia> rstrat = jackknifeweights(dstrat)
+ReplicateDesign:
+data: 200×244 DataFrame
+strata: stype
+    [E, E, E  …  M]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  1018.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  20.36]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0491]
+type: jackknife
+replicates: 200
+
+```
 
 # Reference
 pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)

From 29250123dbc73dfca4dc344aaf2230e824a0ffc2 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Sat, 25 Mar 2023 17:06:29 +0530
Subject: [PATCH 52/63] Completing docstring for `jackknife_variance`.

---
 src/jackknife.jl | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 9473c0f5..5469d30d 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -15,7 +15,9 @@ w_{i(hj)} =
 In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns.
 
 # Examples
-```jldoctest setup = :(using Survey)
+```jldoctest
+julia> using Survey;
+
 julia> apistrat = load_data("apistrat");
 
 julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
@@ -86,11 +88,47 @@ end
 jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
 ```
 
-Compute variance for the given `func` using the Jackknife method. The formula to compute this variance is the following.
+Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following.
 
 ```math
 \\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2
 ```
+
+Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed.
+
+# Examples
+```jldoctest
+julia> using Survey, StatsBase
+
+julia> apistrat = load_data("apistrat");
+
+julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+
+julia> rstrat = jackknifeweights(dstrat)
+ReplicateDesign:
+data: 200×244 DataFrame
+strata: stype
+    [E, E, E  …  M]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  1018.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  20.36]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0491]
+type: jackknife
+replicates: 200
+
+julia> weightedmean(x, y) = mean(x, weights(y));
+
+julia> jackknife_variance(:api00, weightedmean, rstrat)
+1×2 DataFrame
+ Row │ estimator  SE
+     │ Float64    Float64
+─────┼────────────────────
+   1 │   662.287  9.53613
+
+```
+# Reference
+pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
 """
 function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
     df = design.data

From 27ebfed77db5e164a3fcfe11a8e19b20e48b9a4f Mon Sep 17 00:00:00 2001
From: Anshul Singhvi <anshulsinghvi@gmail.com>
Date: Sat, 25 Mar 2023 17:56:46 +0530
Subject: [PATCH 53/63] Update test/total.jl

---
 test/total.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/total.jl b/test/total.jl
index f21cfa81..2138527b 100644
--- a/test/total.jl
+++ b/test/total.jl
@@ -171,7 +171,7 @@ end
     
     # Test that column specifiers from DataFrames make it through this pipeline
     # These tests replicate what you see above...just with a different syntax.
-    tot = total(:api00, Cols(==(:cname)), dclus1)
+    tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1)
     @test size(tot)[1] == apiclus1.cname |> unique |> length
     @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL
     @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL

From c204a14def407255d21355cd46e2acb0d9df8ed8 Mon Sep 17 00:00:00 2001
From: Jishnu Bhattacharya <jishnub.github@gmail.com>
Date: Sun, 26 Mar 2023 09:34:16 +0530
Subject: [PATCH 54/63] Add comment about type-stability

Co-authored-by: Anshul Singhvi <anshulsinghvi@gmail.com>
---
 src/bootstrap.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/bootstrap.jl b/src/bootstrap.jl
index 96e98367..ab8bb0ad 100644
--- a/src/bootstrap.jl
+++ b/src/bootstrap.jl
@@ -30,6 +30,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
         cluster_sorted = sort(substrata, design.cluster)
         cluster_sorted_designcluster = cluster_sorted[!, design.cluster]
         cluster_weights = cluster_sorted[!, design.weights]
+        # Perform the inner loop in a type-stable function to improve runtime.
         _bootweights_cluster_sorted!(cluster_sorted, cluster_weights,
             cluster_sorted_designcluster, replicates, rng)
         substrata_dfs[h] = cluster_sorted

From e9362f9b5821ee6086dd76815a6c3afb42982ac5 Mon Sep 17 00:00:00 2001
From: Shikhar Mishra <sm_data@outlook.com>
Date: Mon, 27 Mar 2023 09:54:19 +0530
Subject: [PATCH 55/63] Correct apiclus2 pw colum

---
 test/runtests.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 5f6b748b..048f74bb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -38,7 +38,6 @@ dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters
 
 # Two-stage cluster sample
 apiclus2 = load_data("apiclus2") # Load API dataset
-apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake for pw column
 dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus2_boot = dclus2 |> bootweights # Create replicate design
 

From 8a3d768fe7e3ca8623bbd672655303968bdf8eb3 Mon Sep 17 00:00:00 2001
From: smishr <43640926+smishr@users.noreply.github.com>
Date: Mon, 27 Mar 2023 14:53:08 +0530
Subject: [PATCH 56/63] add tests clus2 nhanes jk

---
 test/jackknife.jl | 35 ++++++++++++++++++++++++++++++++---
 test/runtests.jl  |  2 ++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/test/jackknife.jl b/test/jackknife.jl
index 49ec0096..73ab5b71 100644
--- a/test/jackknife.jl
+++ b/test/jackknife.jl
@@ -14,16 +14,22 @@
     @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3
     
     mean_clus2_jk = mean([:api00,:api99],dclus2_jk)
-    # @test mean_clus2_jk.SE[1] ≈ XXXXX atol = 1e-3
-    # @test mean_clus2_jk.SE[2] ≈ XXXXX atol = 1e-3
+    @test mean_clus2_jk.SE[1] ≈ 34.9388 atol = 1e-3 # R gives 34.928
+    @test mean_clus2_jk.SE[2] ≈ 34.6565 atol = 1e-3 # R gives 34.645
     
     # Tests using for NHANES
+    mean_nhanes_jk = mean([:seq1, :seq2],dnhanes_jk)
+    @test mean_nhanes_jk.estimator[1] ≈ 21393.96 atol = 1e-3
+    @test mean_nhanes_jk.SE[1] ≈ 143.371 atol = 1e-3 # R is slightly diff in 2nd decimal place
+    @test mean_nhanes_jk.estimator[2] ≈ 38508.328 atol = 1e-3
+    @test mean_nhanes_jk.SE[2] ≈ 258.068 atol = 1e-3 # R is slightly diff in 2nd decimal place
 end
 
 # # R code for correctness above
 # library(survey) 
 # data(api)
 # apiclus1$pw = rep(757/15,nrow(apiclus1))
+# No corrections needed for apiclus2, it has correct weights by default
 
 # #############
 # ###### 23.03.22 PR#260
@@ -56,4 +62,27 @@ end
 # svymean(~api00+api99,dstrat_jk)
 # #         mean     SE
 # # api00 662.29  9.5361
-# # api99 629.39 10.0970
\ No newline at end of file
+# # api99 629.39 10.0970
+
+#### apiclus2
+# > # clus2 without fpc (doesnt match Julia)
+# > dclus2 <- svydesign(id=~dnum+snum, weights=~pw, data=apiclus2)
+# > dclus2_jk <- as.svrepdesign(dclus2, type="JK1", compress=FALSE)
+# > svymean(~api00+api99,dclus2)
+#         mean     SE
+# api00 670.81 30.712
+# api99 645.03 30.308
+# > svymean(~api00+api99,dclus2_jk)
+#         mean     SE
+# api00 670.81 34.928
+# api99 645.03 34.645
+
+# NHANES test
+# > data("nhanes")
+# > nhanes$seq1 = seq(1.0, 5*8591.0, by = 5)
+# > nhanes$seq2 = seq(1.0, 9*8591.0, by = 9)#,  length.out=8591)
+# > dnhanes <- svydesign(id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=TRUE, data=nhanes)
+# > svymean(~seq1+seq2,dnhanes)
+#       mean     SE
+# seq1 21394 143.34
+# seq2 38508 258.01
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index c3926c5c..d709103c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -43,6 +43,8 @@ dclus2_boot = dclus2 |> bootweights # Create replicate design
 
 # NHANES
 nhanes = load_data("nhanes")
+nhanes.seq1 = collect(1.0:5.0:42955.0)
+nhanes.seq2 = collect(1.0:9.0:77319.0) # [9k for k in 0:8590.0]
 dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR)
 dnhanes_boot = dnhanes |> bootweights
 

From 2162a59c196eeef57afbf943ca83c84571664758 Mon Sep 17 00:00:00 2001
From: smishr <43640926+smishr@users.noreply.github.com>
Date: Tue, 28 Mar 2023 11:11:00 +0530
Subject: [PATCH 57/63] add double domain estimation

---
 test/runtests.jl |  1 -
 test/total.jl    | 16 ++++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 5f6b748b..048f74bb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -38,7 +38,6 @@ dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters
 
 # Two-stage cluster sample
 apiclus2 = load_data("apiclus2") # Load API dataset
-apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake for pw column
 dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus2_boot = dclus2 |> bootweights # Create replicate design
 
diff --git a/test/total.jl b/test/total.jl
index 2138527b..1e208397 100644
--- a/test/total.jl
+++ b/test/total.jl
@@ -169,12 +169,16 @@ end
     # > svyby(~api00, ~cname, clus1rep, svytotal)
     # > svyby(~api00, ~cname, clus1rep, svymean)
     
+    # Test multiple domains passed at once
+    tot = total(:api00, [:stype,:cname], dclus1_boot)
+
     # Test that column specifiers from DataFrames make it through this pipeline
     # These tests replicate what you see above...just with a different syntax.
-    tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1)
-    @test size(tot)[1] == apiclus1.cname |> unique |> length
-    @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL
-    @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL
-    @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL
-    @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL
+    tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1_boot)
+    ######## Above Survey.DataFrames.Cols(==(:cname)) syntax doesnt give domain estimates
+    # @test size(tot)[1] == apiclus1.cname |> unique |> length
+    # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL
+    # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL
+    # @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL
+    # @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL
 end

From f9ff9cca61b4d39a0205a4351493f5e560914d46 Mon Sep 17 00:00:00 2001
From: Shikhar Mishra <sm_data@outlook.com>
Date: Tue, 28 Mar 2023 15:27:26 +0530
Subject: [PATCH 58/63] Update src/jackknife.jl

Co-authored-by: Anshul Singhvi <anshulsinghvi@gmail.com>
---
 src/jackknife.jl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 5469d30d..45cef664 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -84,9 +84,7 @@ function jackknifeweights(design::SurveyDesign)
 end
 
 """
-```julia
-jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
-```
+    jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
 
 Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following.
 

From 3daeae6b207d961d9e4d35f25c22ef5623c7a337 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Sat, 1 Apr 2023 00:01:26 +0530
Subject: [PATCH 59/63] Minor change for notation consistency.

---
 src/jackknife.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 45cef664..4dff97d8 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -144,10 +144,10 @@ function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
         cluster_variance = 0
         for psu in psus_in_stratum
             # get replicate weights corresponding to current stratum and psu
-            rep_weights = design.data[!, "replicate_"*string(replicate_index)]
+            rep_weights = df[!, "replicate_"*string(replicate_index)]
 
             # estimator from replicate weights
-            θhj = func(design.data[!, x], rep_weights)
+            θhj = func(df[!, x], rep_weights)
 
             cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2
             replicate_index += 1

From 1da3e8f1f005c20cbdf6187f8448a27a304baa54 Mon Sep 17 00:00:00 2001
From: Siddhant Chaudhary <urssidd@gmail.com>
Date: Sat, 1 Apr 2023 00:16:05 +0530
Subject: [PATCH 60/63] Using tab indent in docstring.

---
 src/jackknife.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/jackknife.jl b/src/jackknife.jl
index 4dff97d8..315de348 100644
--- a/src/jackknife.jl
+++ b/src/jackknife.jl
@@ -1,7 +1,6 @@
 """
-```julia
-jackknifeweights(design::SurveyDesign)
-```
+    jackknifeweights(design::SurveyDesign)
+
 Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula.
 
 ```math

From e55225068cf9cc4f92fe2d47675575aa41ea8eae Mon Sep 17 00:00:00 2001
From: Shikhar Mishra <sm_data@outlook.com>
Date: Mon, 10 Apr 2023 10:41:20 +0530
Subject: [PATCH 61/63] Update total.jl

---
 test/total.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/total.jl b/test/total.jl
index 1e208397..fc02b48f 100644
--- a/test/total.jl
+++ b/test/total.jl
@@ -171,10 +171,11 @@ end
     
     # Test multiple domains passed at once
     tot = total(:api00, [:stype,:cname], dclus1_boot)
-
+    
+    #### Why doesnt this syntax produce domain estimates??
     # Test that column specifiers from DataFrames make it through this pipeline
     # These tests replicate what you see above...just with a different syntax.
-    tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1_boot)
+    # tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1_boot)
     ######## Above Survey.DataFrames.Cols(==(:cname)) syntax doesnt give domain estimates
     # @test size(tot)[1] == apiclus1.cname |> unique |> length
     # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL

From 74c72f2d286a8bfec88419cb8a6b449dd1cdf106 Mon Sep 17 00:00:00 2001
From: smishr <43640926+smishr@users.noreply.github.com>
Date: Mon, 10 Apr 2023 11:14:26 +0530
Subject: [PATCH 62/63] Add Vector{Symbol} test Domain est

---
 test/total.jl | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/test/total.jl b/test/total.jl
index fc02b48f..b7780263 100644
--- a/test/total.jl
+++ b/test/total.jl
@@ -171,7 +171,9 @@ end
     
     # Test multiple domains passed at once
     tot = total(:api00, [:stype,:cname], dclus1_boot)
-    
+    @test filter(row -> row[:cname] == "Los Angeles" && row[:stype] == "E", tot).SE[1] ≈ 343365 rtol = STAT_TOL
+    @test filter(row -> row[:cname] == "Merced" && row[:stype] == "H", tot).SE[1] ≈ 27090.33 rtol = STAT_TOL
+
     #### Why doesnt this syntax produce domain estimates??
     # Test that column specifiers from DataFrames make it through this pipeline
     # These tests replicate what you see above...just with a different syntax.
@@ -183,3 +185,42 @@ end
     # @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL
     # @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL
 end
+
+#### R code for vector{Symbol} domain estimation
+# > data(api)
+# > apiclus1$pw = rep(757/15,nrow(apiclus1))
+# > ### 9.04.23
+# > dclus1<-svydesign(id=~dnum, weights=~pw, data=apiclus1);
+# > rclus1<-as.svrepdesign(dclus1, type="subbootstrap", compress=FALSE, replicates = 4000)
+# > svyby(~api00, ~stype+cname, rclus1, svytotal)
+#               stype       cname      api00         se
+# E.Alameda         E     Alameda  273428.40  275423.33
+# H.Alameda         H     Alameda   30683.73   30907.60
+# M.Alameda         M     Alameda   67272.07   67762.88
+# E.Fresno          E      Fresno   48599.40   47484.67
+# H.Fresno          H      Fresno   22356.73   21843.93
+# M.Fresno          M      Fresno   24324.93   23766.99
+# E.Kern            E        Kern   24930.53   24847.76
+# M.Kern            M        Kern   20741.80   20672.93
+# E.Los Angeles     E Los Angeles  395154.00  341692.92
+# M.Los Angeles     M Los Angeles   94826.87   95416.42
+# E.Mendocino       E   Mendocino   58844.13   57711.15
+# H.Mendocino       H   Mendocino   35124.80   34448.51
+# M.Mendocino       M   Mendocino   31844.47   31231.33
+# E.Merced          E      Merced   50517.13   51424.65
+# H.Merced          H      Merced   26696.87   27176.47
+# M.Merced          M      Merced   27605.27   28101.18
+# E.Orange          E      Orange  463536.33  465047.76
+# M.Orange          M      Orange  110219.20  110578.59
+# E.Plumas          E      Plumas  144284.20  146672.86
+# H.Plumas          H      Plumas  143729.07  146108.54
+# M.Plumas          M      Plumas   34266.87   34834.16
+# E.San Diego       E   San Diego 1670497.13 1233144.04
+# H.San Diego       H   San Diego   63386.13   63693.54
+# M.San Diego       M   San Diego   96492.27   96960.22
+# E.San Joaquin     E San Joaquin  848243.73  848605.33
+# H.San Joaquin     H San Joaquin   79585.93   79619.86
+# M.San Joaquin     M San Joaquin  101387.53  101430.75
+# E.Santa Clara     E Santa Clara  737418.93  484164.71
+# H.Santa Clara     H Santa Clara   35478.07   35311.28
+# M.Santa Clara     M Santa Clara  187685.53  131278.63
\ No newline at end of file

From 8a4bf69cb95e9492bf4d6d0b272e94416e345e16 Mon Sep 17 00:00:00 2001
From: smishr <43640926+smishr@users.noreply.github.com>
Date: Mon, 10 Apr 2023 11:18:21 +0530
Subject: [PATCH 63/63] v0.2

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 792d0adc..7ecf4b9a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Survey"
 uuid = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
 authors = ["Ayush Patnaik <ayushpatnaik@gmail.com>"]
-version = "0.1.0"
+version = "0.2.0"
 
 [deps]
 AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"