From 71e760c6aa55814574dbd0cc8234f04951356d1a Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 21 Feb 2023 00:51:18 +0530 Subject: [PATCH 01/63] Update by.jl --- src/by.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/by.jl b/src/by.jl index 3f955b15..f28de3b5 100644 --- a/src/by.jl +++ b/src/by.jl @@ -1,13 +1,12 @@ -function bydomain(x::Symbol, domain::Symbol, design::SurveyDesign, func::Function) +function bydomain(x::Symbol, domain, design::SurveyDesign, func::Function) gdf = groupby(design.data, domain) - nd = length(unique(design.data[!, domain])) X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic) return X end -function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function) +function bydomain(x::Symbol, domain, design::ReplicateDesign, func::Function) gdf = groupby(design.data, domain) - nd = length(unique(design.data[!, domain])) + nd = length(gdf) X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic) Xt_mat = Array{Float64,2}(undef, (nd, design.replicates)) for i = 1:design.replicates From 77f967f694417f51aae23d9e2824be5f30bef7a8 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 21 Feb 2023 00:51:57 +0530 Subject: [PATCH 02/63] Update mean.jl --- src/mean.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mean.jl b/src/mean.jl index 4db9fed1..75fef801 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -130,7 +130,7 @@ julia> mean(:api00, :cname, bclus1) 11 │ Mendocino 623.25 1.09545e-13 ``` """ -function mean(x::Symbol, domain::Symbol, design::AbstractSurveyDesign) +function mean(x::Symbol, domain, design::AbstractSurveyDesign) weighted_mean(x, w) = mean(x, StatsBase.weights(w)) df = bydomain(x, domain, design, weighted_mean) rename!(df, :statistic => :mean) From 5a01fe9b22f14092aa9f00c6f4e74f58d0fd46eb Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 21 Feb 2023 00:53:42 +0530 Subject: [PATCH 03/63] Update total.jl --- src/total.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/total.jl b/src/total.jl index cc9bb06d..a451324c 100644 --- a/src/total.jl +++ b/src/total.jl @@ -120,7 +120,7 @@ julia> total(:api00, :cname, bclus1) 11 │ Mendocino 84380.6 80215.9 ``` """ -function total(x::Symbol, domain::Symbol, design::AbstractSurveyDesign) +function total(x::Symbol, domain, design::AbstractSurveyDesign) df = bydomain(x, domain, design, wsum) rename!(df, :statistic => :total) end From 9139e6f7fd09ab46b1237abe1409564750efb8de Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Wed, 22 Feb 2023 14:53:30 +0530 Subject: [PATCH 04/63] Adding a default constructor and a constructor which takes `replicate_weights` for `ReplicateDesign`. --- src/SurveyDesign.jl | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index e0211352..4027c73f 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -157,4 +157,54 @@ struct ReplicateDesign <: AbstractSurveyDesign allprobs::Symbol # Right now only singlestage approx supported pps::Bool replicates::UInt + replicate_weights::Vector{Symbol} + + # default constructor + function ReplicateDesign( + data::DataFrame, + cluster::Symbol, + popsize::Symbol, + sampsize::Symbol, + strata::Symbol, + weights::Symbol, + allprobs::Symbol, + pps::Bool, + replicates::UInt, + replicate_weights::Vector{Symbol} + ) + new(data, cluster, popsize, sampsize, strata, weights, allprobs, + pps, replicates, replicate_weights) + end + + # constructor with given replicate_weights + function ReplicateDesign( + data::AbstractDataFrame; + clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, + strata::Union{Nothing,Symbol} = nothing, + popsize::Union{Nothing,Symbol} = nothing, + weights::Union{Nothing,Symbol} = nothing, + replicates::UInt, + replicate_weights::Vector{Symbol} + ) + # call the SurveyDesign constructor + base_design = SurveyDesign( + data; + clusters=clusters, + strata=strata, + popsize=popsize, + weights=weights + ) + new( + base_design.data, + base_design.cluster, + base_design.popsize, + base_design.sampsize, + base_design.strata, + base_design.weights, + base_design.allprobs, + base_design.pps, + replicates, + replicate_weights + ) + end end From a4f68afad710cdb531580b04163c77c7115a5ba1 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Wed, 22 Feb 2023 14:54:32 +0530 Subject: [PATCH 05/63] Minor change in `bootweights`. --- src/bootstrap.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 7e01247f..2fa02df6 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -54,6 +54,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis design.weights, design.allprobs, design.pps, - replicates, + UInt(replicates), + [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates] ) end From 5ea25251e4f351c544c75800a1c6d1345f7ca68d Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Thu, 23 Feb 2023 00:38:51 +0530 Subject: [PATCH 06/63] Modifying docstring for `ReplicateDesign`. --- src/SurveyDesign.jl | 49 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 4027c73f..54e377e6 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -127,14 +127,32 @@ end """ ReplicateDesign <: AbstractSurveyDesign -Survey design obtained by replicating an original design using [`bootweights`](@ref). +Survey design obtained by replicating an original design using [`bootweights`](@ref). If +replicate weights are available, then they can be used to directly create a `ReplicateDesign`. + +# Constructors + +```julia +ReplicateDesign( + data::AbstractDataFrame; + clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, + popsize::Union{Nothing,Symbol} = nothing, + weights::Union{Nothing,Symbol} = nothing, + replicates::UInt, + replicate_weights::Vector{Symbol} +) +``` + +# Examples + +Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`. ```jldoctest julia> apistrat = load_data("apistrat"); julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); -julia> bootstrat = bootweights(dstrat; replicates=1000) +julia> bootstrat = bootweights(dstrat; replicates=1000) # creating a ReplicateDesign using bootweights ReplicateDesign: data: 200×1044 DataFrame strata: stype @@ -145,7 +163,34 @@ sampsize: [100, 100, 100 … 50] weights: [44.21, 44.21, 44.21 … 15.1] allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] replicates: 1000 + ``` + +If the replicate weights are given to us already, then we can directly pass them to the `ReplicateDesign` constructor. For instance, in +the above example, suppose we save the `bootstrat` data as a CSV file. + +```jldoctest +julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data); +``` + +We can now pass the replicate weights directly to the `ReplicateDesign` constructor. + +```jldoctest +julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame); + +julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv; strata=:stype, weights=:pw, replicates=UInt(1000), replicate_weights=[Symbol("replicate_"*string(replicate)) for replicate in 1:1000]) +ReplicateDesign: +data: 200×1044 DataFrame +strata: stype + [E, E, E … H] +cluster: none +popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 15.1] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] +replicates: 1000 +``` + """ struct ReplicateDesign <: AbstractSurveyDesign data::AbstractDataFrame From a6573b1b70fb337764388a5c434560767128e24a Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 24 Feb 2023 14:57:01 +0530 Subject: [PATCH 07/63] Making `replicate_weights` a positional argument in `ReplicateDesign` constructor, and removing the redundant `replicates` argument. --- src/SurveyDesign.jl | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 54e377e6..e0dc6b3b 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -134,12 +134,11 @@ replicate weights are available, then they can be used to directly create a `Rep ```julia ReplicateDesign( - data::AbstractDataFrame; + data::AbstractDataFrame, + replicate_weights::Vector{Symbol}; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, - weights::Union{Nothing,Symbol} = nothing, - replicates::UInt, - replicate_weights::Vector{Symbol} + weights::Union{Nothing,Symbol} = nothing ) ``` @@ -178,7 +177,7 @@ We can now pass the replicate weights directly to the `ReplicateDesign` construc ```jldoctest julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame); -julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv; strata=:stype, weights=:pw, replicates=UInt(1000), replicate_weights=[Symbol("replicate_"*string(replicate)) for replicate in 1:1000]) +julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv, [Symbol("replicate_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw) ReplicateDesign: data: 200×1044 DataFrame strata: stype @@ -223,13 +222,12 @@ struct ReplicateDesign <: AbstractSurveyDesign # constructor with given replicate_weights function ReplicateDesign( - data::AbstractDataFrame; + data::AbstractDataFrame, + replicate_weights::Vector{Symbol}; clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, - weights::Union{Nothing,Symbol} = nothing, - replicates::UInt, - replicate_weights::Vector{Symbol} + weights::Union{Nothing,Symbol} = nothing ) # call the SurveyDesign constructor base_design = SurveyDesign( @@ -248,7 +246,7 @@ struct ReplicateDesign <: AbstractSurveyDesign base_design.weights, base_design.allprobs, base_design.pps, - replicates, + length(replicate_weights), replicate_weights ) end From 3893699fea511ca6fc819f9648ee9af861ec3c26 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 24 Feb 2023 18:14:26 +0530 Subject: [PATCH 08/63] Minor changes to the `ReplicateDesign` docstring. --- src/SurveyDesign.jl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index e0dc6b3b..ebbd4369 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -136,17 +136,24 @@ replicate weights are available, then they can be used to directly create a `Rep ReplicateDesign( data::AbstractDataFrame, replicate_weights::Vector{Symbol}; - clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, strata::Union{Nothing,Symbol} = nothing, + clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, + strata::Union{Nothing,Symbol} = nothing, popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing ) ``` +# Arguments + +The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which +must be a `Vector` of `Symbols`. The `Symbol`s should represent the columns of `data` which contain the replicate weights. The +names of these columns must be of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weights. + # Examples Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`. -```jldoctest +```jldoctest replicate-design julia> apistrat = load_data("apistrat"); julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); @@ -166,15 +173,16 @@ replicates: 1000 ``` If the replicate weights are given to us already, then we can directly pass them to the `ReplicateDesign` constructor. For instance, in -the above example, suppose we save the `bootstrat` data as a CSV file. +the above example, suppose we had the `bootstrat` data as a CSV file. -```jldoctest +```jldoctest replicate-design +julia> using CSV; julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data); ``` We can now pass the replicate weights directly to the `ReplicateDesign` constructor. -```jldoctest +```jldoctest replicate-design julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame); julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv, [Symbol("replicate_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw) From 25e12a8184878a4b22abfbdffea5faf2caf618fe Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 24 Feb 2023 18:59:54 +0530 Subject: [PATCH 09/63] Adding missing doc dependencies, and fixing error in docstring for `ReplicateDesign`. --- docs/Project.toml | 4 +++- src/SurveyDesign.jl | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index 76d580a7..f11e8201 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,7 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c" diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index ebbd4369..b4232ce7 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -153,7 +153,7 @@ names of these columns must be of the form `replicate_i`, where `i` ranges from Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`. -```jldoctest replicate-design +```jldoctest replicate-design; setup = :(using Survey, CSV, DataFrames) julia> apistrat = load_data("apistrat"); julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); @@ -177,7 +177,9 @@ the above example, suppose we had the `bootstrat` data as a CSV file. ```jldoctest replicate-design julia> using CSV; + julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data); + ``` We can now pass the replicate weights directly to the `ReplicateDesign` constructor. @@ -196,6 +198,7 @@ sampsize: [100, 100, 100 … 50] weights: [44.21, 44.21, 44.21 … 15.1] allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] replicates: 1000 + ``` """ From be60c2b5d76a1e8e35baee573eaf7ac672fbb95f Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 24 Feb 2023 22:09:49 +0530 Subject: [PATCH 10/63] Adding tests for `ReplicateDesign` constructor. --- test/SurveyDesign.jl | 14 ++++++++++++++ test/runtests.jl | 5 +++++ 2 files changed, 19 insertions(+) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 0d42fc19..2c18f48a 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -259,3 +259,17 @@ end yrbs = copy(yrbs_original) dyrbs = SurveyDesign(yrbs; clusters = :psu, strata = :stratum, weights = :weight) end + +@testset "ReplicateDesign_constructor" begin + for (sample, sample_direct) in [(bsrs, bsrs_direct), (bstrat, bstrat_direct), (dclus1_boot, dclus1_boot_direct)] + @test isequal(sample.data, sample_direct.data) + @test isequal(sample.popsize, sample_direct.popsize) + @test isequal(sample.sampsize, sample_direct.sampsize) + @test isequal(sample.strata, sample_direct.strata) + @test isequal(sample.weights, sample_direct.weights) + @test isequal(sample.allprobs, sample_direct.allprobs) + @test isequal(sample.pps, sample_direct.pps) + @test isequal(sample.replicates, sample_direct.replicates) + @test isequal(sample.replicate_weights, sample_direct.replicate_weights) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index f2fc89d4..0485dbaa 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,21 +4,26 @@ using CategoricalArrays const STAT_TOL = 1e-5 const SE_TOL = 1e-1 +const REPLICATE_WEIGHTS = [Symbol("replicate_"*string(i)) for i in 1:4000] # Simple random sample apisrs = load_data("apisrs") # Load API dataset srs = SurveyDesign(apisrs, weights = :pw) bsrs = srs |> bootweights # Create replicate design +bsrs_direct = ReplicateDesign(bsrs.data, REPLICATE_WEIGHTS, weights = :pw) # using ReplicateDesign constructor + # Stratified sample apistrat = load_data("apistrat") # Load API dataset dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign bstrat = dstrat |> bootweights # Create replicate design +bstrat_direct = ReplicateDesign(bstrat.data, REPLICATE_WEIGHTS, strata=:stype, weights=:pw) # using ReplicateDesign constructor # One-stage cluster sample apiclus1 = load_data("apiclus1") # Load API dataset apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake for pw column dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus1_boot = dclus1 |> bootweights # Create replicate design +dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATE_WEIGHTS, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor @testset "Survey.jl" begin @test size(load_data("apiclus1")) == (183, 40) From 2da49c557813c31f62cd0e82298310f4b6657e39 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 24 Feb 2023 22:54:54 +0530 Subject: [PATCH 11/63] Adding a `type` field to `ReplicateDesign`. --- src/SurveyDesign.jl | 5 ++++- src/bootstrap.jl | 1 + src/show.jl | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index b4232ce7..2feb68e9 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -211,6 +211,7 @@ struct ReplicateDesign <: AbstractSurveyDesign weights::Symbol # Effective weights in case of singlestage approx supported allprobs::Symbol # Right now only singlestage approx supported pps::Bool + type::String replicates::UInt replicate_weights::Vector{Symbol} @@ -224,11 +225,12 @@ struct ReplicateDesign <: AbstractSurveyDesign weights::Symbol, allprobs::Symbol, pps::Bool, + type::String, replicates::UInt, replicate_weights::Vector{Symbol} ) new(data, cluster, popsize, sampsize, strata, weights, allprobs, - pps, replicates, replicate_weights) + pps, type, replicates, replicate_weights) end # constructor with given replicate_weights @@ -257,6 +259,7 @@ struct ReplicateDesign <: AbstractSurveyDesign base_design.weights, base_design.allprobs, base_design.pps, + "bootstrap", length(replicate_weights), replicate_weights ) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 2fa02df6..cc1941e8 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -54,6 +54,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis design.weights, design.allprobs, design.pps, + "bootstrap", UInt(replicates), [Symbol("replicate_"*string(replicate)) for replicate in 1:replicates] ) diff --git a/src/show.jl b/src/show.jl index 2c323704..09d22b48 100644 --- a/src/show.jl +++ b/src/show.jl @@ -37,6 +37,7 @@ Base.show(io::IO, ::MIME"text/plain", design::SurveyDesign) = surveyshow(io, des function Base.show(io::IO, ::MIME"text/plain", design::ReplicateDesign) # new_io = IOContext(io, :compact=>true, :limit=>true, :displaysize=>(50, 50)) surveyshow(io, design) + printinfo(io, "\ntype", design.type; newline = false) printinfo(io, "\nreplicates", design.replicates; newline = false) end From 409228ae0b1229495b1dc2861e4c045483b5b5f9 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 24 Feb 2023 23:08:02 +0530 Subject: [PATCH 12/63] Updating tests for `show.jl`. --- test/show.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/show.jl b/test/show.jl index d21cb76f..3035190f 100644 --- a/test/show.jl +++ b/test/show.jl @@ -23,6 +23,7 @@ sampsize: [200, 200, 200 … 200] weights: [30.97, 30.97, 30.97 … 30.97] allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] + type: bootstrap replicates: 4000""" show(io, MIME("text/plain"), bsrs) @@ -58,6 +59,7 @@ end sampsize: [100, 100, 100 … 50] weights: [44.21, 44.21, 44.21 … 15.1] allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] + type: bootstrap replicates: 4000""" show(io, MIME("text/plain"), bstrat) @@ -93,6 +95,7 @@ end sampsize: [15, 15, 15 … 15] weights: [50.4667, 50.4667, 50.4667 … 50.4667] allprobs: [0.0198, 0.0198, 0.0198 … 0.0198] + type: bootstrap replicates: 4000""" show(io, MIME("text/plain"), dclus1_boot) From d13c288cc32c169cb966919226db5cb464385b71 Mon Sep 17 00:00:00 2001 From: smishr Date: Fri, 24 Feb 2023 23:54:01 +0530 Subject: [PATCH 13/63] [WIP] Updating contribution guidelines --- CONTRIBUTING.md | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ca099c87..9108e095 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -93,9 +93,7 @@ This way you are modifying as little as possible of previously written code, and * Codedev coverage has reached 100%. So please make sure to add testing cases of contributed code to keep it at 100%. * If you want to propose a new functionality it is strongly recommended to open an issue first and reach a decision on the final design. Then a pull request serves an implementation of the agreed way how things should work. -* If you are a new contributor and would like to get a guidance on what area - you could focus your first PR please do not hesitate to ask and JuliaData members - will help you with picking a topic matching your experience. +* If you are a new contributor and would like to get a guidance on what area you could focus your first PR please do not hesitate to ask and JuliaData members will help you with picking a topic matching your experience. * Feel free to open, or comment on, an issue and solicit feedback early on, especially if you're unsure about aligning with design goals and direction, or if relevant historical comments are ambiguous. @@ -104,22 +102,14 @@ This way you are modifying as little as possible of previously written code, and * Aim for atomic commits, if possible, e.g. `change 'foo' behavior like so` & `'bar' handles such and such corner case`, rather than `update 'foo' and 'bar'` & `fix typo` & `fix 'bar' better`. -* Pull requests are tested against release and development branches of Julia, - so using `Pkg.test("DataFrames")` as you develop can be helpful. +* Pull requests are tested against release branches of Julia, so using `Pkg.test("Survey")` as you develop can be helpful. * The style guidelines outlined below are not the personal style of most contributors, but for consistency throughout the project, we've adopted them. -* It is recommended to disable GitHub Actions on your fork; check Settings > Actions. * If a PR adds a new exported name then make sure to add a docstring for it and add a reference to it in the documentation. * A PR with breaking changes should have `[BREAKING]` as a first part of its name. -* If a PR changes or adds functionality please update NEWS.md file accordingly as - a part of the PR (along with the link to the PR); please do not add entries - to NEWS.md for changes that are bug fixes or are not user visible, such as - adding tests, updating documentation or improving code layout. -* If you make a PR please try to avoid pushing many small commits to GitHub in - a sequence as each such commit triggers a separate CI job, which takes over - an hour. This has a consequence of making other PRs in packages from the JuliaData - ecosystem wait for such CI jobs to finish as hey share a common pool of CI resources. +* A PR which is still draft or work in progress should have `WIP:` as a first part of its name. +* If you make a PR please try to avoid pushing many small commits to GitHub in a sequence as each such commit triggers a separate CI job, which takes compuational time, and not a good use of the small pool of CI resources. ## Style Guidelines From 3c0f68a2f7b9eb6225468c2b7599a704cc94bcdd Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Sat, 25 Feb 2023 01:43:16 +0530 Subject: [PATCH 14/63] Fixing minor errors in documentation. --- README.md | 5 +++-- src/SurveyDesign.jl | 2 ++ src/bootstrap.jl | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 16aaac65..c2c673c5 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,8 @@ cluster: none popsize: [6190.0, 6190.0, 6190.0 … 6190.0] sampsize: [200, 200, 200 … 200] weights: [31.0, 31.0, 31.0 … 31.0] -probs: [0.0323, 0.0323, 0.0323 … 0.0323] +allprobs: [0.0323, 0.0323, 0.0323 … 0.0323] +type: bootstrap replicates: 1000 julia> mean(:api00, bootsrs) @@ -171,4 +172,4 @@ We gratefully acknowledge the JuliaLab at MIT for financial support for this pro ## References -[^1]: [Lumley, Thomas. Complex surveys: a guide to analysis using R. John Wiley & Sons, 2011.](https://books.google.co.in/books?hl=en&lr=&id=L96ludyhFBsC&oi=fnd&pg=PP12&dq=complex+surveys+lumley&ots=ie0y1lnzv1&sig=c4UHI3arjspMJ6OYzlX32E9rNRI#v=onepage&q=complex%20surveys%20lumley&f=false) Page 44 \ No newline at end of file +[^1]: [Lumley, Thomas. Complex surveys: a guide to analysis using R. John Wiley & Sons, 2011.](https://books.google.co.in/books?hl=en&lr=&id=L96ludyhFBsC&oi=fnd&pg=PP12&dq=complex+surveys+lumley&ots=ie0y1lnzv1&sig=c4UHI3arjspMJ6OYzlX32E9rNRI#v=onepage&q=complex%20surveys%20lumley&f=false) Page 44 diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 2feb68e9..43acee38 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -168,6 +168,7 @@ popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] sampsize: [100, 100, 100 … 50] weights: [44.21, 44.21, 44.21 … 15.1] allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] +type: bootstrap replicates: 1000 ``` @@ -197,6 +198,7 @@ popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] sampsize: [100, 100, 100 … 50] weights: [44.21, 44.21, 44.21 … 15.1] allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] +type: bootstrap replicates: 1000 ``` diff --git a/src/bootstrap.jl b/src/bootstrap.jl index cc1941e8..b789783c 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -18,6 +18,7 @@ popsize: [757, 757, 757 … 757] sampsize: [15, 15, 15 … 15] weights: [50.4667, 50.4667, 50.4667 … 50.4667] allprobs: [0.0198, 0.0198, 0.0198 … 0.0198] +type: bootstrap replicates: 1000 ``` """ From 5efee45a62f34ba27ddc6219d58d6b81de74be8e Mon Sep 17 00:00:00 2001 From: smishr Date: Sat, 25 Feb 2023 16:48:12 +0530 Subject: [PATCH 15/63] add jk1 and jkn stratified --- src/Survey.jl | 2 ++ src/jackknife.jl | 82 +++++++++++++++++++++++++++++++++++++++++++++++ test/jackknife.jl | 4 +++ test/runtests.jl | 1 + 4 files changed, 89 insertions(+) create mode 100644 src/jackknife.jl create mode 100644 test/jackknife.jl diff --git a/src/Survey.jl b/src/Survey.jl index 62f263a2..884453c6 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -25,6 +25,7 @@ include("boxplot.jl") include("show.jl") include("ratio.jl") include("by.jl") +include("jackknife.jl") export load_data export AbstractSurveyDesign, SurveyDesign, ReplicateDesign @@ -35,5 +36,6 @@ export hist, sturges, freedman_diaconis export boxplot export bootweights export ratio +export jk1weights, jackknifeweights end diff --git a/src/jackknife.jl b/src/jackknife.jl new file mode 100644 index 00000000..43e54f2e --- /dev/null +++ b/src/jackknife.jl @@ -0,0 +1,82 @@ +""" + Delete 1 jackknife with no/single stratum + + ## Reference + pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +""" +function jk1weights(design::SurveyDesign) + cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting? + psus = unique(cluster_sorted[!, design.cluster]) + nh = length(psus) + npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] + replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] + cluster_weights = cluster_sorted[!, design.weights] + for (i,replicate) in enumerate(replicates) + cluster_sorted[!, "replicate_"*string(i)] = vcat( + [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh + ]..., + ) .* cluster_weights + end + return cluster_sorted +end + +""" + WIP: Delete-1 jackknife algorithm for replicate weights columns + + ## Reference + pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +""" +function jackknifeweights(design::SurveyDesign) + stratified = groupby(design.data, design.strata) + H = length(keys(stratified)) + @show H + substrata_dfs = DataFrame[] + for h in 1:H + @show h + substrata = DataFrame(stratified[h]) + cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting? + psus = unique(cluster_sorted[!, design.cluster]) + # cluster_weights = substrata[!, design.weights] + # gdf = groupby(substrata, design.cluster) + # psus = unique(substrata[!, design.cluster]) + nh = length(psus) + npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] + replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] + @show nh, psus, npsus + # @show replicates + cluster_weights = cluster_sorted[!, design.weights] + @show size(cluster_weights) + for (i,replicate) in enumerate(replicates) + tmp = vcat( + [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh + ]..., + ) .* cluster_weights + @show tmp + cluster_sorted[!, "replicate_"*string(i)] = tmp + + end + push!(substrata_dfs, cluster_sorted) + return vcat(substrata_dfs...) + end +end + # push!(cluster_weights, [(count(==(i), replicate)) for i in 1:nh].*(nh/(nh-1)) )# main jackknife algo. + # for j in 1:nh + # clusterweight = clusterweights[j] + # for i in 1:nh + # gdf[i][!,"replicate_"*string(j)] = clusterweight[i].*DataFrame(gdf[i]).weights + # end + # end + # stratified[h].replicate_1 = DataFrame(gdf).replicate_1 + + # df = vcat(substrata_dfs...) + # return ReplicateDesign( + # df, + # design.cluster, + # design.popsize, + # design.sampsize, + # design.strata, + # design.weights, + # design.allprobs, + # design.pps, + # replicates, + # ) diff --git a/test/jackknife.jl b/test/jackknife.jl new file mode 100644 index 00000000..7578998f --- /dev/null +++ b/test/jackknife.jl @@ -0,0 +1,4 @@ +@testset "jackknife" begin + dclus1_jack = dclus1 |> jackknifeweights + dclus1_jk1 = dclus1 |> jk1weights # Create replicate design +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index f2fc89d4..d7e8572e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -35,3 +35,4 @@ include("hist.jl") include("boxplot.jl") include("ratio.jl") include("show.jl") +# include("jackknife.jl") \ No newline at end of file From 7ef490e0f77b273900c25d9c1e81f34277e8910f Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 26 Feb 2023 12:29:19 +0530 Subject: [PATCH 16/63] return ReplicateDesign class --- src/jackknife.jl | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 43e54f2e..a48daac6 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -1,6 +1,9 @@ """ - Delete 1 jackknife with no/single stratum + Delete 1 jackknife with for unstratified designs. + Replicate weights are given by: + wi(hj) = 0, if observation unit i is in psu j of stratum h + = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. ## Reference pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ @@ -17,11 +20,26 @@ function jk1weights(design::SurveyDesign) ]..., ) .* cluster_weights end - return cluster_sorted + return ReplicateDesign( + cluster_sorted, + design.cluster, + design.popsize, + design.sampsize, + design.strata, + design.weights, + design.allprobs, + design.pps, + length(replicates), + ) end """ WIP: Delete-1 jackknife algorithm for replicate weights columns + + Replicate weights are given by: + wi(hj) = wi, if observation unit i is not in stratum h + = 0, if observation unit i is in psu j of stratum h + = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. ## Reference pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) From 6668e042f55df635fa4e14cd15d55f1e156b0d3b Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 26 Feb 2023 17:04:29 +0530 Subject: [PATCH 17/63] simple jkn algo, add estimator --- src/jackknife.jl | 115 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 30 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index a48daac6..17bc97b3 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -33,6 +33,22 @@ function jk1weights(design::SurveyDesign) ) end +""" + Jackknife estimator + + Uses jackknife replicate weights to calculate variance of estimator θ̂ +""" +function JackknifeEstimator(θ̂::Float64,design::ReplicateDesign) + gdf = groupby(design.data,design.strata) + # @show keys(gdf) + calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf) + calc_df[!,:multiplier] = (calc_df[!,:x1] .- 1 ) ./ calc_df[!,:x1] + # for each_gdf in gdf + # @show nrow(each_gdf) + # end + return calc_df +end + """ WIP: Delete-1 jackknife algorithm for replicate weights columns @@ -45,38 +61,77 @@ end pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ function jackknifeweights(design::SurveyDesign) - stratified = groupby(design.data, design.strata) - H = length(keys(stratified)) - @show H - substrata_dfs = DataFrame[] - for h in 1:H - @show h - substrata = DataFrame(stratified[h]) - cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting? - psus = unique(cluster_sorted[!, design.cluster]) - # cluster_weights = substrata[!, design.weights] - # gdf = groupby(substrata, design.cluster) - # psus = unique(substrata[!, design.cluster]) - nh = length(psus) - npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] - replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] - @show nh, psus, npsus - # @show replicates - cluster_weights = cluster_sorted[!, design.weights] - @show size(cluster_weights) - for (i,replicate) in enumerate(replicates) - tmp = vcat( - [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh - ]..., - ) .* cluster_weights - @show tmp - cluster_sorted[!, "replicate_"*string(i)] = tmp - - end - push!(substrata_dfs, cluster_sorted) - return vcat(substrata_dfs...) + df = design.data + ## Find number of psus (nh) in each strata, used inside loop + stratified = groupby(df, design.strata) + nh_df = combine(x -> length(unique(x[!,design.cluster])), gdf) + + # Iterate over each combination of strata X cluster + unique_strata_cols_df = unique(select(df, design.strata, design.cluster)) + counter = 1 + for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2]) + colname = "replicate_"*string(counter) + @show colname, strata, psu + ###### three mutually exclusive cases based on strata and psu + + ### if observation unit i is not in stratum h + not_in_strata = df[df[!,design.strata] .!= strata,:] |> nrow + # Set replicate weights at these indices to: wi + + ### if observation unit i is in psu j of stratum h + in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:] + # Set replicate weights at these indices to: 0 + + ### if observation unit i is in stratum h but not in psu j. + in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:] + # Set replicate weights at these indices to: (nh-1)/nh + counter += 1 end + return end + + # @show unique_strata_cols_df + # stratas = unique(df[!,design.strata]) + # psus = unique(df[!,design.cluster]) + # for i in 1:I + + # if design.data[i,design.weights] + # design.data[i,colname] = + # end +# sort!(design.data, (design.strata,design.cluster)) + # stratified = groupby(design.data, design.strata) + # H = length(keys(stratified)) + # @show H + # calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf).x1 + # substrata_dfs = DataFrame[] + # for h in 1:H + # @show h + # substrata = DataFrame(stratified[h]) + # cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting? + # psus = unique(cluster_sorted[!, design.cluster]) + # # cluster_weights = substrata[!, design.weights] + # # gdf = groupby(substrata, design.cluster) + # # psus = unique(substrata[!, design.cluster]) + # nh = length(psus) + # npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] + # replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] + # @show nh, psus, npsus + # # @show replicates + # cluster_weights = cluster_sorted[!, design.weights] + # @show size(cluster_weights) + # for (i,replicate) in enumerate(replicates) + # tmp = vcat( + # [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh + # ]..., + # ) .* cluster_weights + # @show tmp + # cluster_sorted[!, "replicate_"*string(i)] = tmp + + # end + # push!(substrata_dfs, cluster_sorted) + # return vcat(substrata_dfs...) + # end + # push!(cluster_weights, [(count(==(i), replicate)) for i in 1:nh].*(nh/(nh-1)) )# main jackknife algo. # for j in 1:nh # clusterweight = clusterweights[j] From cdf943021ce06e587269e7b180e5611a532ac5d4 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 26 Feb 2023 17:04:53 +0530 Subject: [PATCH 18/63] Add nhanes to runtests --- src/Survey.jl | 2 +- test/jackknife.jl | 6 ++++-- test/runtests.jl | 5 +++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/Survey.jl b/src/Survey.jl index 884453c6..91a0eb81 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -36,6 +36,6 @@ export hist, sturges, freedman_diaconis export boxplot export bootweights export ratio -export jk1weights, jackknifeweights +export jk1weights, JackknifeEstimator, jackknifeweights end diff --git a/test/jackknife.jl b/test/jackknife.jl index 7578998f..b33bd25b 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -1,4 +1,6 @@ @testset "jackknife" begin - dclus1_jack = dclus1 |> jackknifeweights + dclus1_jack = dnhanes |> jackknifeweights dclus1_jk1 = dclus1 |> jk1weights # Create replicate design -end \ No newline at end of file +end +dclus1_jk1 = dclus1 |> jk1weights +dclus1_jack = dnhanes |> jackknifeweights \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index d7e8572e..28bd8cae 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,6 +20,11 @@ apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake fo dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus1_boot = dclus1 |> bootweights # Create replicate design +# NHANES +nhanes = load_data("nhanes") +dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) +dnhanes_boot = dnhanes |> bootweights + @testset "Survey.jl" begin @test size(load_data("apiclus1")) == (183, 40) @test size(load_data("apiclus2")) == (126, 41) From 92becd4244a624460d0e2a51e0f5ee1b24d244d9 Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 26 Feb 2023 17:34:47 +0530 Subject: [PATCH 19/63] nh-1 multiplier --- src/jackknife.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 17bc97b3..3a5411a2 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -65,6 +65,7 @@ function jackknifeweights(design::SurveyDesign) ## Find number of psus (nh) in each strata, used inside loop stratified = groupby(df, design.strata) nh_df = combine(x -> length(unique(x[!,design.cluster])), gdf) + nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) # Iterate over each combination of strata X cluster unique_strata_cols_df = unique(select(df, design.strata, design.cluster)) @@ -75,7 +76,7 @@ function jackknifeweights(design::SurveyDesign) ###### three mutually exclusive cases based on strata and psu ### if observation unit i is not in stratum h - not_in_strata = df[df[!,design.strata] .!= strata,:] |> nrow + not_in_strata = df[df[!,design.strata] .!= strata,:] # Set replicate weights at these indices to: wi ### if observation unit i is in psu j of stratum h @@ -84,7 +85,8 @@ function jackknifeweights(design::SurveyDesign) ### if observation unit i is in stratum h but not in psu j. in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:] - # Set replicate weights at these indices to: (nh-1)/nh + # Set replicate weights at these indices to: nh/(nh-1) + counter += 1 end return From d36581b6ef7234f3e074cece01dff740ab25d1fb Mon Sep 17 00:00:00 2001 From: smishr Date: Sun, 26 Feb 2023 17:36:06 +0530 Subject: [PATCH 20/63] fix comment --- src/jackknife.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 3a5411a2..11f340a0 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -77,15 +77,15 @@ function jackknifeweights(design::SurveyDesign) ### if observation unit i is not in stratum h not_in_strata = df[df[!,design.strata] .!= strata,:] - # Set replicate weights at these indices to: wi + # Set replicate weight at these indices to: wi ### if observation unit i is in psu j of stratum h in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:] - # Set replicate weights at these indices to: 0 + # Set replicate weight at these indices to: 0 ### if observation unit i is in stratum h but not in psu j. in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:] - # Set replicate weights at these indices to: nh/(nh-1) + # Set replicate weight at these indices to: nh/(nh-1) * wi counter += 1 end From 7e61dd17f25ee2c13901dda48b65101959b553dd Mon Sep 17 00:00:00 2001 From: smishr Date: Mon, 27 Feb 2023 19:58:21 +0530 Subject: [PATCH 21/63] add wi to jknife --- src/jackknife.jl | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 11f340a0..9b97c076 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -63,23 +63,29 @@ end function jackknifeweights(design::SurveyDesign) df = design.data ## Find number of psus (nh) in each strata, used inside loop - stratified = groupby(df, design.strata) - nh_df = combine(x -> length(unique(x[!,design.cluster])), gdf) + stratified_gdf = groupby(df, design.strata) + nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf) nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) # Iterate over each combination of strata X cluster unique_strata_cols_df = unique(select(df, design.strata, design.cluster)) counter = 1 + wᵢ = df[:,design.weights] for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2]) colname = "replicate_"*string(counter) @show colname, strata, psu + + # Initialise replicate_i with original sampling weights + df[!,colname] = wᵢ + ###### three mutually exclusive cases based on strata and psu - ### if observation unit i is not in stratum h - not_in_strata = df[df[!,design.strata] .!= strata,:] - # Set replicate weight at these indices to: wi - + ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata) + # not_in_strata = df[df[!,design.strata] .!= strata,:] + # Keep replicate weight at these indices to: wi + ### if observation unit i is in psu j of stratum h + # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu) in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:] # Set replicate weight at these indices to: 0 From 0a656e4d04e5824e39d5dff9b936f40dce9122b3 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 28 Feb 2023 01:29:58 +0530 Subject: [PATCH 22/63] Changing the names of `replicate_weights` to the form `replicate_i`. --- src/SurveyDesign.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 43acee38..e616eaf3 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -244,6 +244,9 @@ struct ReplicateDesign <: AbstractSurveyDesign popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing ) + # rename the replicate weights if needed + rename!(data, [replicate_weights[index] => "replicate_"*string(index) for index in 1:length(replicate_weights)]) + # call the SurveyDesign constructor base_design = SurveyDesign( data; From 7fa80ee7876a6bf34fa338d903a238dd7e9b36c2 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 28 Feb 2023 01:50:31 +0530 Subject: [PATCH 23/63] Adding constructor for `ReplicateDesign` which takes a `UnitRange`. --- src/SurveyDesign.jl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index e616eaf3..f574bcc4 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -269,4 +269,22 @@ struct ReplicateDesign <: AbstractSurveyDesign replicate_weights ) end + + # replicate weights given as a range of columns + ReplicateDesign( + data::AbstractDataFrame, + replicate_weights::UnitRange{Int}; + clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, + strata::Union{Nothing,Symbol} = nothing, + popsize::Union{Nothing,Symbol} = nothing, + weights::Union{Nothing,Symbol} = nothing + ) = + ReplicateDesign( + data, + Symbol.(names(data)[replicate_weights]); + clusters=clusters, + strata=strata, + popsize=popsize, + weights=weights + ) end From 648f1a5813704ce046423424d761c4266997d9ae Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 28 Feb 2023 02:02:17 +0530 Subject: [PATCH 24/63] Adding constructor for `ReplicateDesign` which takes a regular expression. --- src/SurveyDesign.jl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index f574bcc4..c499d869 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -287,4 +287,22 @@ struct ReplicateDesign <: AbstractSurveyDesign popsize=popsize, weights=weights ) + + # replicate weights given as regular expression + ReplicateDesign( + data::AbstractDataFrame, + replicate_weights::Regex; + clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, + strata::Union{Nothing,Symbol} = nothing, + popsize::Union{Nothing,Symbol} = nothing, + weights::Union{Nothing,Symbol} = nothing + ) = + ReplicateDesign( + data, + Symbol.(names(data)[findall(name -> occursin(replicate_weights, name), names(data))]); + clusters=clusters, + strata=strata, + popsize=popsize, + weights=weights + ) end From a88c51bdfd2710bd1c4e2ca02eea2bfda9592575 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 28 Feb 2023 16:52:18 +0530 Subject: [PATCH 25/63] Adding docs for new constructors of `ReplicateDesign`. --- src/SurveyDesign.jl | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index c499d869..21904c0a 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -141,13 +141,36 @@ ReplicateDesign( popsize::Union{Nothing,Symbol} = nothing, weights::Union{Nothing,Symbol} = nothing ) + +ReplicateDesign( + data::AbstractDataFrame, + replicate_weights::UnitIndex{Int}; + clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, + strata::Union{Nothing,Symbol} = nothing, + popsize::Union{Nothing,Symbol} = nothing, + weights::Union{Nothing,Symbol} = nothing +) + +ReplicateDesign( + data::AbstractDataFrame, + replicate_weights::Regex; + clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing, + strata::Union{Nothing,Symbol} = nothing, + popsize::Union{Nothing,Symbol} = nothing, + weights::Union{Nothing,Symbol} = nothing +) ``` # Arguments -The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which -must be a `Vector` of `Symbols`. The `Symbol`s should represent the columns of `data` which contain the replicate weights. The -names of these columns must be of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weights. +The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which can +be of one of the following types. + +- `Vector{Symbol}`: In this case, each `Symbol` in the vector should represent a column of `data` containing the replicate weights. +- `UnitIndex{Int}`: For instance, this could be UnitRange(5:10). This will mean that the replicate weights are contained in columns 5 through 10. +- `Regex`: In this case, all the columns of `data` which match this `Regex` will be treated as the columns containing the replicate weights. + +All the columns containing the replicate weights will be renamed to the form `replicate_i`, where `i` ranges from 1 to the number of columns containing the replicate weights. # Examples @@ -174,21 +197,21 @@ replicates: 1000 ``` If the replicate weights are given to us already, then we can directly pass them to the `ReplicateDesign` constructor. For instance, in -the above example, suppose we had the `bootstrat` data as a CSV file. +the above example, suppose we had the `bootstrat` data as a CSV file (for this example, we also rename the columns containing the replicate weights to the form `r_i`). ```jldoctest replicate-design julia> using CSV; +julia> DataFrames.rename!(bootstrat.data, ["replicate_"*string(index) => "r_"*string(index) for index in 1:1000]) + julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data); ``` -We can now pass the replicate weights directly to the `ReplicateDesign` constructor. +We can now pass the replicate weights directly to the `ReplicateDesign` constructor, either as a `Vector{Symbol}`, a `UnitRange` or a `Regex`. ```jldoctest replicate-design -julia> apistrat_fromcsv = CSV.read("apistrat_withreplicates.csv", DataFrame); - -julia> bootstrat_direct = ReplicateDesign(apistrat_fromcsv, [Symbol("replicate_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw) +julia> bootstrat_direct = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), [Symbol("r_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw) ReplicateDesign: data: 200×1044 DataFrame strata: stype @@ -201,6 +224,10 @@ allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] type: bootstrap replicates: 1000 +julia> bootstrat_unitrange = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw) + +julia> bootstrat_regex = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw) + ``` """ From 9ce6a7379c1f9b508d3dbcfb7ec6736f84a5a584 Mon Sep 17 00:00:00 2001 From: Debartha Paul Date: Tue, 28 Feb 2023 22:51:25 +0530 Subject: [PATCH 26/63] Added apiclus2 for runtests --- test/runtests.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index f2fc89d4..084a0317 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,6 +20,12 @@ apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake fo dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus1_boot = dclus1 |> bootweights # Create replicate design +# Two-stage cluster sample +apiclus2 = load_data("apiclus2") # Load API dataset +apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake for pw column +dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign +dclus2_boot = dclus2 |> bootweights # Create replicate design + @testset "Survey.jl" begin @test size(load_data("apiclus1")) == (183, 40) @test size(load_data("apiclus2")) == (126, 41) From a7c6f80b5e32ffe4cb34c2754c84521f00d21dae Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Wed, 1 Mar 2023 00:22:31 +0530 Subject: [PATCH 27/63] Adding tests for the new `ReplicateDesign` constructors. --- test/SurveyDesign.jl | 30 +++++++++++++++++++++++++++++- test/runtests.jl | 19 +++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 2c18f48a..5084c1e4 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -260,7 +260,7 @@ end dyrbs = SurveyDesign(yrbs; clusters = :psu, strata = :stratum, weights = :weight) end -@testset "ReplicateDesign_constructor" begin +@testset "ReplicateDesign_direct" begin for (sample, sample_direct) in [(bsrs, bsrs_direct), (bstrat, bstrat_direct), (dclus1_boot, dclus1_boot_direct)] @test isequal(sample.data, sample_direct.data) @test isequal(sample.popsize, sample_direct.popsize) @@ -273,3 +273,31 @@ end @test isequal(sample.replicate_weights, sample_direct.replicate_weights) end end + +@testset "ReplicateDesign_unitrange" begin + for (sample, sample_unitrange) in [(bsrs, bsrs_unitrange), (bstrat, bstrat_unitrange), (dclus1_boot, dclus1_boot_unitrange)] + @test isequal(sample.data, sample_unitrange.data) + @test isequal(sample.popsize, sample_unitrange.popsize) + @test isequal(sample.sampsize, sample_unitrange.sampsize) + @test isequal(sample.strata, sample_unitrange.strata) + @test isequal(sample.weights, sample_unitrange.weights) + @test isequal(sample.allprobs, sample_unitrange.allprobs) + @test isequal(sample.pps, sample_unitrange.pps) + @test isequal(sample.replicates, sample_unitrange.replicates) + @test isequal(sample.replicate_weights, sample_unitrange.replicate_weights) + end +end + +@testset "ReplicateDesign_regex" begin + for (sample, sample_regex) in [(bsrs, bsrs_regex), (bstrat, bstrat_regex), (dclus1_boot, dclus1_boot_regex)] + @test isequal(sample.data, sample_regex.data) + @test isequal(sample.popsize, sample_regex.popsize) + @test isequal(sample.sampsize, sample_regex.sampsize) + @test isequal(sample.strata, sample_regex.strata) + @test isequal(sample.weights, sample_regex.weights) + @test isequal(sample.allprobs, sample_regex.allprobs) + @test isequal(sample.pps, sample_regex.pps) + @test isequal(sample.replicates, sample_regex.replicates) + @test isequal(sample.replicate_weights, sample_regex.replicate_weights) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 0485dbaa..61d6637b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,26 +4,37 @@ using CategoricalArrays const STAT_TOL = 1e-5 const SE_TOL = 1e-1 -const REPLICATE_WEIGHTS = [Symbol("replicate_"*string(i)) for i in 1:4000] +TOTAL_REPLICATES = 4000 +REPLICATES_VECTOR = [Symbol("replicate_"*string(i)) for i in 1:TOTAL_REPLICATES] +REPLICATES_REGEX = r"r*_\d" # Simple random sample apisrs = load_data("apisrs") # Load API dataset srs = SurveyDesign(apisrs, weights = :pw) +unitrange = UnitRange((length(names(apisrs)) + 1):(TOTAL_REPLICATES + length(names(apisrs)))) bsrs = srs |> bootweights # Create replicate design -bsrs_direct = ReplicateDesign(bsrs.data, REPLICATE_WEIGHTS, weights = :pw) # using ReplicateDesign constructor +bsrs_direct = ReplicateDesign(bsrs.data, REPLICATES_VECTOR, weights = :pw) # using ReplicateDesign constructor +bsrs_unitrange = ReplicateDesign(bsrs.data, unitrange, weights = :pw) # using ReplicateDesign constructor +bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw) # using ReplicateDesign constructor # Stratified sample apistrat = load_data("apistrat") # Load API dataset dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign +unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat)))) bstrat = dstrat |> bootweights # Create replicate design -bstrat_direct = ReplicateDesign(bstrat.data, REPLICATE_WEIGHTS, strata=:stype, weights=:pw) # using ReplicateDesign constructor +bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw) # using ReplicateDesign constructor +bstrat_unitrange = ReplicateDesign(bstrat.data, unitrange, strata=:stype, weights=:pw) # using ReplicateDesign constructor +bstrat_regex = ReplicateDesign(bstrat.data, REPLICATES_REGEX, strata=:stype, weights=:pw) # using ReplicateDesign constructor # One-stage cluster sample apiclus1 = load_data("apiclus1") # Load API dataset apiclus1[!, :pw] = fill(757 / 15, (size(apiclus1, 1),)) # Correct api mistake for pw column dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw) # Create SurveyDesign +unitrange = UnitRange((length(names(apiclus1)) + 1):(TOTAL_REPLICATES + length(names(apiclus1)))) dclus1_boot = dclus1 |> bootweights # Create replicate design -dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATE_WEIGHTS, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor +dclus1_boot_direct = ReplicateDesign(dclus1_boot.data, REPLICATES_VECTOR, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor +dclus1_boot_unitrange = ReplicateDesign(dclus1_boot.data, unitrange, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor +dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters=:dnum, weights=:pw) # using ReplicateDesign constructor @testset "Survey.jl" begin @test size(load_data("apiclus1")) == (183, 40) From 327dd14b7aa91f04252a1ffb6993017f2e02adeb Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Wed, 1 Mar 2023 00:48:23 +0530 Subject: [PATCH 28/63] Fixing doctests in `ReplicateDesign`. --- src/SurveyDesign.jl | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 21904c0a..09d986a1 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -202,7 +202,7 @@ the above example, suppose we had the `bootstrat` data as a CSV file (for this e ```jldoctest replicate-design julia> using CSV; -julia> DataFrames.rename!(bootstrat.data, ["replicate_"*string(index) => "r_"*string(index) for index in 1:1000]) +julia> DataFrames.rename!(bootstrat.data, ["replicate_"*string(index) => "r_"*string(index) for index in 1:1000]); julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data); @@ -225,8 +225,30 @@ type: bootstrap replicates: 1000 julia> bootstrat_unitrange = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw) +ReplicateDesign: +data: 200×1044 DataFrame +strata: stype + [E, E, E … H] +cluster: none +popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 15.1] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] +type: bootstrap +replicates: 1000 julia> bootstrat_regex = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw) +ReplicateDesign: +data: 200×1044 DataFrame +strata: stype + [E, E, E … H] +cluster: none +popsize: [4420.9999, 4420.9999, 4420.9999 … 755.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 15.1] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0662] +type: bootstrap +replicates: 1000 ``` From 36f6dc8dde0a6742dcba12a223ff344f6f413c55 Mon Sep 17 00:00:00 2001 From: Debartha Paul Date: Tue, 7 Mar 2023 12:31:57 +0530 Subject: [PATCH 29/63] Added tests for ratio Tests for ratio estimation of stratified sampling --- test/ratio.jl | 3 +++ test/runtests.jl | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/test/ratio.jl b/test/ratio.jl index d0f5f975..f3326e1e 100644 --- a/test/ratio.jl +++ b/test/ratio.jl @@ -2,4 +2,7 @@ @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4 @test ratio(:api00, :enroll, dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1 @test ratio(:api00, :enroll, dclus1_boot).ratio[1] ≈ 1.17182 atol = 1e-4 + @test ratio(:api00, :enroll, dstrat).ratio[1] ≈ 1.11256 atol = 1e-4 + @test ratio(:api00, :enroll, dstrat_boot).SE[1] ≈ 0.04185 atol = 1e-1 + @test ratio(:api00, :enroll, dstrat_boot).ratio[1] ≈ 1.11256 atol = 1e-4 end diff --git a/test/runtests.jl b/test/runtests.jl index 5f6b748b..064c3b1c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -42,6 +42,11 @@ apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake fo dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus2_boot = dclus2 |> bootweights # Create replicate design +# Stratified sample +apistrat = load_data("apistrat") # Load API dataset +dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw, popsize = :fpc) # Create SurveyDesign +dstrat_boot = dstrat |> bootweights # Create replicate design + @testset "Survey.jl" begin @test size(load_data("apiclus1")) == (183, 40) @test size(load_data("apiclus2")) == (126, 41) From 26a1e912fa663fc9dfd1bf2832b24e7065e3dde6 Mon Sep 17 00:00:00 2001 From: Debartha Paul Date: Wed, 8 Mar 2023 15:40:15 +0530 Subject: [PATCH 30/63] Modified stratified design Removed finite population correction in `dstrat` --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 064c3b1c..1da4a1a5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -44,7 +44,7 @@ dclus2_boot = dclus2 |> bootweights # Create replicate design # Stratified sample apistrat = load_data("apistrat") # Load API dataset -dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw, popsize = :fpc) # Create SurveyDesign +dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw) # Create SurveyDesign dstrat_boot = dstrat |> bootweights # Create replicate design @testset "Survey.jl" begin From 46268021320e6000c2b7765c81a1b8e67d4d68f9 Mon Sep 17 00:00:00 2001 From: Debartha Paul Date: Fri, 10 Mar 2023 08:30:10 +0530 Subject: [PATCH 31/63] Update test/runtests.jl Remove recurring definitions for `apistrat` Co-authored-by: Ayush Patnaik --- test/runtests.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 1da4a1a5..5f6b748b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -42,11 +42,6 @@ apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake fo dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus2_boot = dclus2 |> bootweights # Create replicate design -# Stratified sample -apistrat = load_data("apistrat") # Load API dataset -dstrat = SurveyDesign(apistrat; strata = :stype, weights = :pw) # Create SurveyDesign -dstrat_boot = dstrat |> bootweights # Create replicate design - @testset "Survey.jl" begin @test size(load_data("apiclus1")) == (183, 40) @test size(load_data("apiclus2")) == (126, 41) From 424bde1c24e92090a058bbebed6a4ec1b6e4a459 Mon Sep 17 00:00:00 2001 From: Debartha Paul Date: Fri, 10 Mar 2023 08:55:09 +0530 Subject: [PATCH 32/63] Added replicate design for stratified --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index 5f6b748b..e06effc7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,6 +20,7 @@ bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw) # usin # Stratified sample apistrat = load_data("apistrat") # Load API dataset dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign +dstrat_boot = dstrat |> bootweights # Create replicate design unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat)))) bstrat = dstrat |> bootweights # Create replicate design bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw) # using ReplicateDesign constructor From 4c2dd2146f12615cc4057dfc6c2d702cb1ffb026 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 10 Mar 2023 15:27:52 +0530 Subject: [PATCH 33/63] Adding `jackknifeweights` implementation. --- src/jackknife.jl | 183 +++++++++++++++++++++++++++++------------------ 1 file changed, 115 insertions(+), 68 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 9b97c076..2607a0a5 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -1,37 +1,37 @@ -""" - Delete 1 jackknife with for unstratified designs. +# """ +# Delete 1 jackknife with for unstratified designs. - Replicate weights are given by: - wi(hj) = 0, if observation unit i is in psu j of stratum h - = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. - ## Reference - pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) -""" -function jk1weights(design::SurveyDesign) - cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting? - psus = unique(cluster_sorted[!, design.cluster]) - nh = length(psus) - npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] - replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] - cluster_weights = cluster_sorted[!, design.weights] - for (i,replicate) in enumerate(replicates) - cluster_sorted[!, "replicate_"*string(i)] = vcat( - [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh - ]..., - ) .* cluster_weights - end - return ReplicateDesign( - cluster_sorted, - design.cluster, - design.popsize, - design.sampsize, - design.strata, - design.weights, - design.allprobs, - design.pps, - length(replicates), - ) -end +# Replicate weights are given by: +# wi(hj) = 0, if observation unit i is in psu j of stratum h +# = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. +# ## Reference +# pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +# """ +# function jk1weights(design::SurveyDesign) +# cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting? +# psus = unique(cluster_sorted[!, design.cluster]) +# nh = length(psus) +# npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] +# replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] +# cluster_weights = cluster_sorted[!, design.weights] +# for (i,replicate) in enumerate(replicates) +# cluster_sorted[!, "replicate_"*string(i)] = vcat( +# [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh +# ]..., +# ) .* cluster_weights +# end +# return ReplicateDesign( +# cluster_sorted, +# design.cluster, +# design.popsize, +# design.sampsize, +# design.strata, +# design.weights, +# design.allprobs, +# design.pps, +# length(replicates), +# ) +# end """ Jackknife estimator @@ -49,53 +49,100 @@ function JackknifeEstimator(θ̂::Float64,design::ReplicateDesign) return calc_df end -""" - WIP: Delete-1 jackknife algorithm for replicate weights columns +# """ +# WIP: Delete-1 jackknife algorithm for replicate weights columns - Replicate weights are given by: - wi(hj) = wi, if observation unit i is not in stratum h - = 0, if observation unit i is in psu j of stratum h - = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. +# Replicate weights are given by: +# wi(hj) = wi, if observation unit i is not in stratum h +# = 0, if observation unit i is in psu j of stratum h +# = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. + +# ## Reference +# pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +# """ +# function jackknifeweights(design::SurveyDesign) +# df = design.data +# ## Find number of psus (nh) in each strata, used inside loop +# stratified_gdf = groupby(df, design.strata) +# nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf) +# nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) + +# # Iterate over each combination of strata X cluster +# unique_strata_cols_df = unique(select(df, design.strata, design.cluster)) +# counter = 1 +# wᵢ = df[:,design.weights] +# for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2]) +# colname = "replicate_"*string(counter) +# @show colname, strata, psu + +# # Initialise replicate_i with original sampling weights +# df[!,colname] = wᵢ + +# ###### three mutually exclusive cases based on strata and psu + +# ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata) +# # not_in_strata = df[df[!,design.strata] .!= strata,:] +# # Keep replicate weight at these indices to: wi + +# ### if observation unit i is in psu j of stratum h +# # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu) +# in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:] +# # Set replicate weight at these indices to: 0 + +# ### if observation unit i is in stratum h but not in psu j. +# in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:] +# # Set replicate weight at these indices to: nh/(nh-1) * wi + +# counter += 1 +# end +# return +# end - ## Reference - pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) -""" function jackknifeweights(design::SurveyDesign) df = design.data - ## Find number of psus (nh) in each strata, used inside loop + + # Find number of psus (nh) in each strata, used inside loop stratified_gdf = groupby(df, design.strata) - nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf) - nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) - - # Iterate over each combination of strata X cluster + nh = Dict{String, Int}() + for (key, subgroup) in pairs(stratified_gdf) + nh[key[design.strata]] = length(unique(subgroup[!, design.cluster])) + end + + # iterating over each unique combinations of strata and cluster unique_strata_cols_df = unique(select(df, design.strata, design.cluster)) - counter = 1 - wᵢ = df[:,design.weights] - for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2]) - colname = "replicate_"*string(counter) - @show colname, strata, psu + replicate_index = 1 + for row in eachrow(unique_strata_cols_df) + stratum = row[design.strata] + psu = row[design.cluster] + colname = "replicate_"*string(replicate_index) # Initialise replicate_i with original sampling weights - df[!,colname] = wᵢ + df[!, colname] = Vector(df[!, design.weights]) - ###### three mutually exclusive cases based on strata and psu + # getting indexes + same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu) + different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu) - ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata) - # not_in_strata = df[df[!,design.strata] .!= strata,:] - # Keep replicate weight at these indices to: wi + # scaling weights appropriately + df[same_psu, colname] .*= 0 + df[different_psu, colname] .*= nh[stratum]/(nh[stratum] - 1) - ### if observation unit i is in psu j of stratum h - # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu) - in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:] - # Set replicate weight at these indices to: 0 - - ### if observation unit i is in stratum h but not in psu j. - in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:] - # Set replicate weight at these indices to: nh/(nh-1) * wi - - counter += 1 + replicate_index += 1 end - return + + return ReplicateDesign( + df, + design.cluster, + design.popsize, + design.sampsize, + design.strata, + design.weights, + design.allprobs, + design.pps, + "jackknife", + UInt(DataFrames.nrow(unique_strata_cols_df)), + [Symbol("replicate_"*string(index)) for index in 1:DataFrames.nrow(unique_strata_cols_df)] + ) end # @show unique_strata_cols_df From 876fa7fd62b2ca52d0f89404f7f5c60d39aa7550 Mon Sep 17 00:00:00 2001 From: Shikhar Mishra Date: Tue, 14 Mar 2023 13:41:46 +0530 Subject: [PATCH 34/63] Update test/runtests.jl Co-authored-by: Ayush Patnaik --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index e06effc7..5f6b748b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,7 +20,6 @@ bsrs_regex = ReplicateDesign(bsrs.data, REPLICATES_REGEX, weights = :pw) # usin # Stratified sample apistrat = load_data("apistrat") # Load API dataset dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) # Create SurveyDesign -dstrat_boot = dstrat |> bootweights # Create replicate design unitrange = UnitRange((length(names(apistrat)) + 1):(TOTAL_REPLICATES + length(names(apistrat)))) bstrat = dstrat |> bootweights # Create replicate design bstrat_direct = ReplicateDesign(bstrat.data, REPLICATES_VECTOR, strata=:stype, weights=:pw) # using ReplicateDesign constructor From 13e358a54127875ac686ac8d17dfa2142d4accc4 Mon Sep 17 00:00:00 2001 From: Shikhar Mishra Date: Tue, 14 Mar 2023 13:42:35 +0530 Subject: [PATCH 35/63] Update ratio.jl change to bstrat --- test/ratio.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/ratio.jl b/test/ratio.jl index f3326e1e..3536a169 100644 --- a/test/ratio.jl +++ b/test/ratio.jl @@ -2,7 +2,7 @@ @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4 @test ratio(:api00, :enroll, dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1 @test ratio(:api00, :enroll, dclus1_boot).ratio[1] ≈ 1.17182 atol = 1e-4 - @test ratio(:api00, :enroll, dstrat).ratio[1] ≈ 1.11256 atol = 1e-4 - @test ratio(:api00, :enroll, dstrat_boot).SE[1] ≈ 0.04185 atol = 1e-1 - @test ratio(:api00, :enroll, dstrat_boot).ratio[1] ≈ 1.11256 atol = 1e-4 + @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4 + @test ratio(:api00, :enroll, bstrat).SE[1] ≈ 0.04185 atol = 1e-1 + @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4 end From 246ff130be637549eb2f7b8f68abd3c2edd683f7 Mon Sep 17 00:00:00 2001 From: Shikhar Mishra Date: Tue, 14 Mar 2023 13:47:58 +0530 Subject: [PATCH 36/63] Update ratio.jl combining #279 into this PR --- test/ratio.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/ratio.jl b/test/ratio.jl index 3536a169..b9865f09 100644 --- a/test/ratio.jl +++ b/test/ratio.jl @@ -1,8 +1,14 @@ @testset "ratio.jl" begin + # on :api00 + @test ratio(:api00, :enroll, srs).ratio[1] ≈ 1.1231 atol = 1e-4 @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4 @test ratio(:api00, :enroll, dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1 @test ratio(:api00, :enroll, dclus1_boot).ratio[1] ≈ 1.17182 atol = 1e-4 @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4 @test ratio(:api00, :enroll, bstrat).SE[1] ≈ 0.04185 atol = 1e-1 @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4 + # on :api99 + @test ratio(:api99, :enroll, bsrs).ratio[1] ≈ 1.06854 atol = 1e-4 + @test ratio(:api99, :enroll, dstrat).ratio[1] ≈ 1.0573 atol = 1e-4 + @test ratio(:api99, :enroll, bstrat).ratio[1] ≈ 1.0573 atol = 1e-4 end From 02136a551883dd69e146cdc0506952957a3d1fd6 Mon Sep 17 00:00:00 2001 From: smishr Date: Tue, 14 Mar 2023 14:41:59 +0530 Subject: [PATCH 37/63] remove commented code jkknife. rm jk1 only function --- src/jackknife.jl | 171 +++-------------------------------------------- 1 file changed, 9 insertions(+), 162 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 2607a0a5..1c15d676 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -1,103 +1,14 @@ -# """ -# Delete 1 jackknife with for unstratified designs. - -# Replicate weights are given by: -# wi(hj) = 0, if observation unit i is in psu j of stratum h -# = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. -# ## Reference -# pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) -# """ -# function jk1weights(design::SurveyDesign) -# cluster_sorted = sort(design.data, design.cluster) # Does jk need sorting? -# psus = unique(cluster_sorted[!, design.cluster]) -# nh = length(psus) -# npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] -# replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] -# cluster_weights = cluster_sorted[!, design.weights] -# for (i,replicate) in enumerate(replicates) -# cluster_sorted[!, "replicate_"*string(i)] = vcat( -# [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh -# ]..., -# ) .* cluster_weights -# end -# return ReplicateDesign( -# cluster_sorted, -# design.cluster, -# design.popsize, -# design.sampsize, -# design.strata, -# design.weights, -# design.allprobs, -# design.pps, -# length(replicates), -# ) -# end - -""" - Jackknife estimator - - Uses jackknife replicate weights to calculate variance of estimator θ̂ """ -function JackknifeEstimator(θ̂::Float64,design::ReplicateDesign) - gdf = groupby(design.data,design.strata) - # @show keys(gdf) - calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf) - calc_df[!,:multiplier] = (calc_df[!,:x1] .- 1 ) ./ calc_df[!,:x1] - # for each_gdf in gdf - # @show nrow(each_gdf) - # end - return calc_df -end - -# """ -# WIP: Delete-1 jackknife algorithm for replicate weights columns - -# Replicate weights are given by: -# wi(hj) = wi, if observation unit i is not in stratum h -# = 0, if observation unit i is in psu j of stratum h -# = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. - -# ## Reference -# pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) -# """ -# function jackknifeweights(design::SurveyDesign) -# df = design.data -# ## Find number of psus (nh) in each strata, used inside loop -# stratified_gdf = groupby(df, design.strata) -# nh_df = combine(x -> length(unique(x[!,design.cluster])), stratified_gdf) -# nh_df[!,:multiplier] = nh_df[!,:x1] ./ (nh_df[!,:x1] .- 1 ) + Delete-1 Jackknife algorithm for replication weights from sampling weights -# # Iterate over each combination of strata X cluster -# unique_strata_cols_df = unique(select(df, design.strata, design.cluster)) -# counter = 1 -# wᵢ = df[:,design.weights] -# for (strata,psu) in zip(unique_strata_cols_df[!,1],unique_strata_cols_df[!,2]) -# colname = "replicate_"*string(counter) -# @show colname, strata, psu - -# # Initialise replicate_i with original sampling weights -# df[!,colname] = wᵢ - -# ###### three mutually exclusive cases based on strata and psu - -# ### if observation unit i is not in stratum h (usually most of the observations if evenly divided strata) -# # not_in_strata = df[df[!,design.strata] .!= strata,:] -# # Keep replicate weight at these indices to: wi - -# ### if observation unit i is in psu j of stratum h -# # return (df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu) -# in_strata_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .== psu),:] -# # Set replicate weight at these indices to: 0 - -# ### if observation unit i is in stratum h but not in psu j. -# in_strata_not_psu = df[(df[!,design.strata] .== strata) .&& (df[!,design.cluster] .!= psu),:] -# # Set replicate weight at these indices to: nh/(nh-1) * wi - -# counter += 1 -# end -# return -# end + Replicate weights are given by: + wi(hj) = wi, if observation unit i is not in stratum h + = 0, if observation unit i is in psu j of stratum h + = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. + ## Reference + pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +""" function jackknifeweights(design::SurveyDesign) df = design.data @@ -143,68 +54,4 @@ function jackknifeweights(design::SurveyDesign) UInt(DataFrames.nrow(unique_strata_cols_df)), [Symbol("replicate_"*string(index)) for index in 1:DataFrames.nrow(unique_strata_cols_df)] ) -end - - # @show unique_strata_cols_df - # stratas = unique(df[!,design.strata]) - # psus = unique(df[!,design.cluster]) - # for i in 1:I - - # if design.data[i,design.weights] - # design.data[i,colname] = - # end -# sort!(design.data, (design.strata,design.cluster)) - # stratified = groupby(design.data, design.strata) - # H = length(keys(stratified)) - # @show H - # calc_df = combine(x -> length(unique(x[!,design.cluster])), gdf).x1 - # substrata_dfs = DataFrame[] - # for h in 1:H - # @show h - # substrata = DataFrame(stratified[h]) - # cluster_sorted = sort(substrata, design.cluster) # Does jk need sorting? - # psus = unique(cluster_sorted[!, design.cluster]) - # # cluster_weights = substrata[!, design.weights] - # # gdf = groupby(substrata, design.cluster) - # # psus = unique(substrata[!, design.cluster]) - # nh = length(psus) - # npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] - # replicates = [filter(n -> n != i, 1:nh) for i in 1:nh] - # @show nh, psus, npsus - # # @show replicates - # cluster_weights = cluster_sorted[!, design.weights] - # @show size(cluster_weights) - # for (i,replicate) in enumerate(replicates) - # tmp = vcat( - # [ fill((count(==(i), replicate)) .* (nh / (nh - 1)), npsus[i]) for i = 1:nh - # ]..., - # ) .* cluster_weights - # @show tmp - # cluster_sorted[!, "replicate_"*string(i)] = tmp - - # end - # push!(substrata_dfs, cluster_sorted) - # return vcat(substrata_dfs...) - # end - - # push!(cluster_weights, [(count(==(i), replicate)) for i in 1:nh].*(nh/(nh-1)) )# main jackknife algo. - # for j in 1:nh - # clusterweight = clusterweights[j] - # for i in 1:nh - # gdf[i][!,"replicate_"*string(j)] = clusterweight[i].*DataFrame(gdf[i]).weights - # end - # end - # stratified[h].replicate_1 = DataFrame(gdf).replicate_1 - - # df = vcat(substrata_dfs...) - # return ReplicateDesign( - # df, - # design.cluster, - # design.popsize, - # design.sampsize, - # design.strata, - # design.weights, - # design.allprobs, - # design.pps, - # replicates, - # ) +end \ No newline at end of file From 22338c2607d944adb29308d614407e5405f3ae2f Mon Sep 17 00:00:00 2001 From: smishr Date: Tue, 14 Mar 2023 14:43:55 +0530 Subject: [PATCH 38/63] rm exports --- src/Survey.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Survey.jl b/src/Survey.jl index 91a0eb81..1be5b203 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -36,6 +36,6 @@ export hist, sturges, freedman_diaconis export boxplot export bootweights export ratio -export jk1weights, JackknifeEstimator, jackknifeweights +export jackknifeweights end From 6aa8466c36f418e60a906da722758fac449d13b3 Mon Sep 17 00:00:00 2001 From: smishr Date: Tue, 14 Mar 2023 16:20:34 +0530 Subject: [PATCH 39/63] Sorting Replicatedesign jackknife for now --- src/jackknife.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/jackknife.jl b/src/jackknife.jl index 1c15d676..eefde9d6 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -10,6 +10,7 @@ pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ function jackknifeweights(design::SurveyDesign) + sort!(design.data, [design.strata, design.cluster]) df = design.data # Find number of psus (nh) in each strata, used inside loop From 3ab7ccca8b8865c815e7c5e30aec40c8ba22ae41 Mon Sep 17 00:00:00 2001 From: smishr Date: Tue, 14 Mar 2023 16:21:29 +0530 Subject: [PATCH 40/63] Modify jk mean. Start jk tests --- src/mean.jl | 52 ++++++++++++++++++++++++++++++++++++++++------- test/jackknife.jl | 11 +++++----- test/runtests.jl | 2 +- 3 files changed, 52 insertions(+), 13 deletions(-) diff --git a/src/mean.jl b/src/mean.jl index 4db9fed1..d13eb116 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -46,13 +46,51 @@ julia> mean(:api00, bclus1) ``` """ function mean(x::Symbol, design::ReplicateDesign) - X = mean(design.data[!, x], weights(design.data[!, design.weights])) - Xt = [ - mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for - i = 1:design.replicates - ] - variance = sum((Xt .- X) .^ 2) / design.replicates - DataFrame(mean = X, SE = sqrt(variance)) + if design.type == "bootstrap" + θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights])) + θ̂t = [ + mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for + i = 1:design.replicates + ] + variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates + + elseif design.type == "jackknife" + + # The estimate based on original sampling weight vector + θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights])) + + # Find number of psus (nh) in each strata, used inside loop + stratified_gdf = groupby(design.data, design.strata) + nh = Dict{String, Int}() + for (key, subgroup) in pairs(stratified_gdf) + nh[key[design.strata]] = length(unique(subgroup[!, design.cluster])) + end + + # iterating over each unique combinations of strata and cluster + unique_strata_cols_df = unique(select(design.data, design.strata, design.cluster)) + variance = 0 + replicate_index = 1 + SSE_θ̂hj = 0 + previous_strata = first(unique_strata_cols_df[!,design.strata]) + for row in eachrow(unique_strata_cols_df) + stratum = row[design.strata] + # psu = row[design.cluster] + colname = "replicate_"*string(replicate_index) + + if stratum != previous_strata + variance += ((nh[stratum] - 1) / nh[stratum]) * SSE_θ̂hj + previous_strata = stratum + + θ̂j = mean(design.data[!, x], weights(design.data[!, colname])) + SSE_θ̂hj = (θ̂j - θ̂)^2 + else + θ̂j = mean(design.data[!, x], weights(design.data[!, colname])) + SSE_θ̂hj += (θ̂j - θ̂)^2 + end + replicate_index += 1 + end + end + DataFrame(mean = θ̂, SE = sqrt(variance)) end """ diff --git a/test/jackknife.jl b/test/jackknife.jl index b33bd25b..c6527ac8 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -1,6 +1,7 @@ @testset "jackknife" begin - dclus1_jack = dnhanes |> jackknifeweights - dclus1_jk1 = dclus1 |> jk1weights # Create replicate design -end -dclus1_jk1 = dclus1 |> jk1weights -dclus1_jack = dnhanes |> jackknifeweights \ No newline at end of file + dstrat_jk = jackknifeweights(dstrat) + mean(:api00,dstrat_jk) + mean(:api99,dstrat_jk) + # dnhanes_jk = jackknifeweights(dnhanes) + # dclus1_jk = jackknifeweights(dclus1) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 31f0fa84..bf3acbf3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -62,4 +62,4 @@ include("hist.jl") include("boxplot.jl") include("ratio.jl") include("show.jl") -# include("jackknife.jl") \ No newline at end of file +include("jackknife.jl") \ No newline at end of file From 6071a43b9d6a3756ae9697b2cc829ddb3de36d0f Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Fri, 17 Mar 2023 22:37:18 +0530 Subject: [PATCH 41/63] Adding `jackknife_variance` implementation. Also shortening code for `jackknifeweights`. --- src/Survey.jl | 2 +- src/jackknife.jl | 74 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/src/Survey.jl b/src/Survey.jl index 1be5b203..6960cfb5 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -36,6 +36,6 @@ export hist, sturges, freedman_diaconis export boxplot export bootweights export ratio -export jackknifeweights +export jackknifeweights, jackknife_variance end diff --git a/src/jackknife.jl b/src/jackknife.jl index eefde9d6..c9e90be2 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -13,33 +13,28 @@ function jackknifeweights(design::SurveyDesign) sort!(design.data, [design.strata, design.cluster]) df = design.data - # Find number of psus (nh) in each strata, used inside loop stratified_gdf = groupby(df, design.strata) - nh = Dict{String, Int}() + replicate_index = 0 for (key, subgroup) in pairs(stratified_gdf) - nh[key[design.strata]] = length(unique(subgroup[!, design.cluster])) - end - - # iterating over each unique combinations of strata and cluster - unique_strata_cols_df = unique(select(df, design.strata, design.cluster)) - replicate_index = 1 - for row in eachrow(unique_strata_cols_df) - stratum = row[design.strata] - psu = row[design.cluster] - colname = "replicate_"*string(replicate_index) + stratum = key[design.strata] # current stratum + psus_in_stratum = unique(subgroup[!, design.cluster]) + nh = length(psus_in_stratum) - # Initialise replicate_i with original sampling weights - df[!, colname] = Vector(df[!, design.weights]) + for psu in psus_in_stratum + replicate_index += 1 + colname = "replicate_"*string(replicate_index) - # getting indexes - same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu) - different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu) + # Initialise replicate_i with original sampling weights + df[!, colname] = Vector(df[!, design.weights]) - # scaling weights appropriately - df[same_psu, colname] .*= 0 - df[different_psu, colname] .*= nh[stratum]/(nh[stratum] - 1) + # getting indexes + same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu) + different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu) - replicate_index += 1 + # scaling weights appropriately + df[same_psu, colname] .*= 0 + df[different_psu, colname] .*= nh/(nh - 1) + end end return ReplicateDesign( @@ -52,7 +47,38 @@ function jackknifeweights(design::SurveyDesign) design.allprobs, design.pps, "jackknife", - UInt(DataFrames.nrow(unique_strata_cols_df)), - [Symbol("replicate_"*string(index)) for index in 1:DataFrames.nrow(unique_strata_cols_df)] + UInt(replicate_index), + [Symbol("replicate_"*string(index)) for index in 1:replicate_index] ) -end \ No newline at end of file +end + +function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) + df = design.data + # sort!(df, [design.strata, design.cluster]) + stratified_gdf = groupby(df, design.strata) + + # estimator from original weights + θ = func(df[!, x], df[!, design.weights]) + + variance = 0 + replicate_index = 1 + for subgroup in stratified_gdf + psus_in_stratum = unique(subgroup[!, design.cluster]) + nh = length(psus_in_stratum) + cluster_variance = 0 + for psu in psus_in_stratum + # get replicate weights corresponding to current stratum and psu + rep_weights = design.data[!, "replicate_"*string(replicate_index)] + + # estimator from replicate weights + θhj = func(design.data[!, x], rep_weights) + + cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2 + replicate_index += 1 + end + variance += cluster_variance + end + + return DataFrame(estimator = θ, SE = sqrt(variance)) +end + From 4f49122b7c955f9d631e63c3674210d30b501d00 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Sat, 18 Mar 2023 01:09:47 +0530 Subject: [PATCH 42/63] Adding preliminary docstrings for `jackknifeweights` and `jackknife_variance`. --- docs/src/api.md | 2 ++ src/jackknife.jl | 37 ++++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 341dcbba..e3dd4f08 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -14,6 +14,8 @@ SurveyDesign ReplicateDesign load_data bootweights +jackknifeweights +jackknife_variance mean total quantile diff --git a/src/jackknife.jl b/src/jackknife.jl index c9e90be2..5f0e5428 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -1,13 +1,21 @@ """ - Delete-1 Jackknife algorithm for replication weights from sampling weights - - Replicate weights are given by: - wi(hj) = wi, if observation unit i is not in stratum h - = 0, if observation unit i is in psu j of stratum h - = nh(nh −1)wi, if observation unit i is in stratum h but not in psu j. - - ## Reference - pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +```julia +jackknifeweights(design::SurveyDesign) +``` +Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula. + +```math +w_{i(hj)} = +\\begin{cases} + w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\ + 0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\ + \\dfrac{n_h}{n_h - 1}w_i &\\quad\\quad\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\ +\\end{cases} +``` +The replicate weight columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. These are added to the underlying `design.data` `DataFrame`. + +# Reference +pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ function jackknifeweights(design::SurveyDesign) sort!(design.data, [design.strata, design.cluster]) @@ -52,6 +60,17 @@ function jackknifeweights(design::SurveyDesign) ) end +""" +```julia +jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) +``` + +Compute variance for the given `func` using the Jackknife method. The formula to compute this variance is the following. + +```math +\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2 +``` +""" function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) df = design.data # sort!(df, [design.strata, design.cluster]) From 10e7fb8355eefb85a5411fedc1c92afb9796546f Mon Sep 17 00:00:00 2001 From: nadiaenh Date: Sun, 19 Mar 2023 02:58:20 -0700 Subject: [PATCH 43/63] added instructions for WSL + table of contents --- CONTRIBUTING.md | 51 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ca099c87..dd7aee97 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,6 +2,17 @@ # Contributing to Survey.jl + * [Overview](#overview) + * [Reporting Issues](#reporting-issues) + * [Recommended workflow setup](#recommended-workflow-setup) + * [Modifying an existing docstring in `src/`](#modifying-an-existing-docstring-in--src--) + * [Adding a new docstring to `src/`](#adding-a-new-docstring-to--src--) + * [Doctests](#doctests) + * [Integration with exisiting API](#integration-with-exisiting-api) + * [Contributing](#contributing) + * [Style Guidelines](#style-guidelines) + * [Git Recommendations For Pull Requests](#git-recommendations-for-pull-requests) + ## Overview Thank you for thinking about making contributions to Survey.jl! We aim to keep consistency in contribution guidelines to DataFrames.jl, which is the main upstream dependency for the project. @@ -16,6 +27,46 @@ Reading through the ColPrac guide for collaborative practices is highly recommen (`Pkg.add(name="Survey", rev="main")`) is a good gut check and can streamline the process, along with including the first two lines of output from `versioninfo()` +## Recommended workflow setup + +This tutorial assumes you will be using Windows Subsystem for Linux (WSL) and VSCode, which is recommended. + +1. Install Ubuntu on WSL from the [Ubuntu website](https://ubuntu.com/wsl) or the Microsoft Store +2. Create a fork of the [Survey.jl repository](https://github.com/xKDR/Survey.jl). You will only be ever working on this fork, and submitting Pull Requests to the main repo. +3. Copy the SSH link from your fork by clicking the green `<> Code` icon and then `SSH`. + - You must already have SSH setup for this to work. If you don't, look up a tutorial on how to clone a github repository using SSH. +4. Open a WSL terminal, and run : + - `curl -fsSL https://install.julialang.org | sh` + - `git clone git@github.com:your_username/Survey.jl.git` -- replace "*your_username**" + - `julia` +3. You are now in the Julia REPL, run : + - `import Pkg; Pkg.add("Revise")` + - `import Pkg; Pkg.add("Survey")` + - `import Pkg; Pkg.add("Test")` + - `] dev .` +4. Open VSCode and install the following extensions : + - WSL + - Julia +5. Go back to your WSL terminal, navigate to the folder of your repo, and run `code .` to open VSCode in that folder +6. Create a `dev` folder, and a `test.jl` file. Paste this block of code and save : + +```julia +using Revise, Survey, Test + +@testset "ratio.jl" begin + apiclus1 = load_data("apiclus1") + dclus1 = SurveyDesign(apiclus1; clusters=:dnum, strata=:stype, weights=:pw) + @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4 +end +``` + +9. In the WSL terminal (not Julia REPL), run `julia dev/test.jl` +✅ If you get no errors, your setup is now complete ! + +You can keep working in the `dev` folder, which is .gitignored. +Once you have working code and tests, you can move them to the appropriate folders, commit, push, and submit a Pull Request. +Make sure to read the rest of this document so you can learn the best practices and guidelines for this project. + ## Modifying an existing docstring in `src/` All docstrings are written inline above the methods or types they are associated with and can From ad8b1dc01ff6cb66544e4988a4438f0881b05d3d Mon Sep 17 00:00:00 2001 From: Shikhar Mishra Date: Thu, 23 Mar 2023 11:24:26 +0530 Subject: [PATCH 44/63] Update CONTRIBUTING.md --- CONTRIBUTING.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dd7aee97..ee5e1acf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,9 +27,9 @@ Reading through the ColPrac guide for collaborative practices is highly recommen (`Pkg.add(name="Survey", rev="main")`) is a good gut check and can streamline the process, along with including the first two lines of output from `versioninfo()` -## Recommended workflow setup +## Setting up development workflow -This tutorial assumes you will be using Windows Subsystem for Linux (WSL) and VSCode, which is recommended. +Below tutorial uses Windows Subsystem for Linux (WSL) and VSCode. Linux/MacOS/BSD can ignore WSL specific steps. 1. Install Ubuntu on WSL from the [Ubuntu website](https://ubuntu.com/wsl) or the Microsoft Store 2. Create a fork of the [Survey.jl repository](https://github.com/xKDR/Survey.jl). You will only be ever working on this fork, and submitting Pull Requests to the main repo. @@ -48,7 +48,7 @@ This tutorial assumes you will be using Windows Subsystem for Linux (WSL) and VS - WSL - Julia 5. Go back to your WSL terminal, navigate to the folder of your repo, and run `code .` to open VSCode in that folder -6. Create a `dev` folder, and a `test.jl` file. Paste this block of code and save : +6. Create a `dev` folder (only if you want, it is gitignored by default), and a `test.jl` file in the file. Paste this block of code and save : ```julia using Revise, Survey, Test @@ -145,7 +145,7 @@ This way you are modifying as little as possible of previously written code, and * If you want to propose a new functionality it is strongly recommended to open an issue first and reach a decision on the final design. Then a pull request serves an implementation of the agreed way how things should work. * If you are a new contributor and would like to get a guidance on what area - you could focus your first PR please do not hesitate to ask and JuliaData members + you could focus your first PR please do not hesitate to ask community members will help you with picking a topic matching your experience. * Feel free to open, or comment on, an issue and solicit feedback early on, especially if you're unsure about aligning with design goals and direction, @@ -155,8 +155,8 @@ This way you are modifying as little as possible of previously written code, and * Aim for atomic commits, if possible, e.g. `change 'foo' behavior like so` & `'bar' handles such and such corner case`, rather than `update 'foo' and 'bar'` & `fix typo` & `fix 'bar' better`. -* Pull requests are tested against release and development branches of Julia, - so using `Pkg.test("DataFrames")` as you develop can be helpful. +* Pull requests are tested against release branches of Julia, + so using `Pkg.test("Survey")` as you develop can be helpful. * The style guidelines outlined below are not the personal style of most contributors, but for consistency throughout the project, we've adopted them. * It is recommended to disable GitHub Actions on your fork; check Settings > Actions. From 5f662ed93a6e04224b20a50f5f169c1dc4639e39 Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 23 Mar 2023 11:52:16 +0530 Subject: [PATCH 45/63] integrate with mean --- src/mean.jl | 40 ++++------------------------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/src/mean.jl b/src/mean.jl index d13eb116..efde4efd 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -53,44 +53,12 @@ function mean(x::Symbol, design::ReplicateDesign) i = 1:design.replicates ] variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates - + return DataFrame(mean = θ̂, SE = sqrt(variance)) + # Jackknife integration elseif design.type == "jackknife" - - # The estimate based on original sampling weight vector - θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights])) - - # Find number of psus (nh) in each strata, used inside loop - stratified_gdf = groupby(design.data, design.strata) - nh = Dict{String, Int}() - for (key, subgroup) in pairs(stratified_gdf) - nh[key[design.strata]] = length(unique(subgroup[!, design.cluster])) - end - - # iterating over each unique combinations of strata and cluster - unique_strata_cols_df = unique(select(design.data, design.strata, design.cluster)) - variance = 0 - replicate_index = 1 - SSE_θ̂hj = 0 - previous_strata = first(unique_strata_cols_df[!,design.strata]) - for row in eachrow(unique_strata_cols_df) - stratum = row[design.strata] - # psu = row[design.cluster] - colname = "replicate_"*string(replicate_index) - - if stratum != previous_strata - variance += ((nh[stratum] - 1) / nh[stratum]) * SSE_θ̂hj - previous_strata = stratum - - θ̂j = mean(design.data[!, x], weights(design.data[!, colname])) - SSE_θ̂hj = (θ̂j - θ̂)^2 - else - θ̂j = mean(design.data[!, x], weights(design.data[!, colname])) - SSE_θ̂hj += (θ̂j - θ̂)^2 - end - replicate_index += 1 - end + weightedmean(x, y) = mean(x, weights(y)) + return jackknife_variance(x, weightedmean, design) end - DataFrame(mean = θ̂, SE = sqrt(variance)) end """ From 0985ec1077fb274ef4f2f2ff5f7f58b08549a375 Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 23 Mar 2023 12:34:03 +0530 Subject: [PATCH 46/63] added JK tests srs, strat --- test/jackknife.jl | 53 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/test/jackknife.jl b/test/jackknife.jl index c6527ac8..ce7e685c 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -1,7 +1,48 @@ @testset "jackknife" begin - dstrat_jk = jackknifeweights(dstrat) - mean(:api00,dstrat_jk) - mean(:api99,dstrat_jk) - # dnhanes_jk = jackknifeweights(dnhanes) - # dclus1_jk = jackknifeweights(dclus1) -end \ No newline at end of file + # Create Jackknife replicate designs + bsrs_jk = srs |> jackknifeweights # SRS + dstrat_jk = dstrat |> jackknifeweights # Stratified + dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster + dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified + + mean(:api00,bsrs_jk).SE[1] ≈ 9.4028 atol = 1e-3 + mean([:api99,:api00],dstrat_jk).SE[1] ≈ 10.097 atol = 1e-3 + # mean(:api00,dclus2_jk) + # mean(,dnhanes_jk) +end + +# # R code for correctness above +# library(survey) +# data(api) +# apiclus1$pw = rep(757/15,nrow(apiclus1)) + +# ############# +# ###### 23.03.22 PR#260 + +# ## srs with fpc (doesnt match Julia) +# srs <- svydesign(id=~1,data=apisrs, weights=~pw, fpc=~fpc) +# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE) +# svymean(~api00,bsrs_jk) +# # mean SE +# # api00 656.58 9.2497 + +# ## srs without fpc (matches Julia) +# srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc) +# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE) +# svymean(~api00,bsrs_jk) +# # mean SE +# # api00 656.58 9.4028 + +# # strat with fpc (doesnt match Julia) +# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc) +# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE) +# svymean(~api99,dstrat_jk) +# # mean SE +# # api99 629.39 9.9639 + +# # strat without fpc (matches Julia) +# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc) +# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE) +# svymean(~api99,dstrat_jk) +# # mean SE +# # api99 629.39 10.097 \ No newline at end of file From a223253c77d1229b1c883c966d78f9f5951ecf69 Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 23 Mar 2023 12:43:54 +0530 Subject: [PATCH 47/63] improved formatting for srs strat tests --- test/jackknife.jl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/test/jackknife.jl b/test/jackknife.jl index ce7e685c..a61ae934 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -5,8 +5,14 @@ dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified - mean(:api00,bsrs_jk).SE[1] ≈ 9.4028 atol = 1e-3 - mean([:api99,:api00],dstrat_jk).SE[1] ≈ 10.097 atol = 1e-3 + mean_srs_jk = mean([:api00,:api99], bsrs_jk) + @test mean_srs_jk.SE[1] ≈ 9.4028 atol = 1e-3 + @test mean_srs_jk.SE[2] ≈ 9.6575 atol = 1e-3 + + mean_strat_jk = mean([:api00,:api99],dstrat_jk) + @test mean_strat_jk.SE[1] ≈ 9.5361 atol = 1e-3 + @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3 + # mean(:api00,dclus2_jk) # mean(,dnhanes_jk) end @@ -29,9 +35,10 @@ end # ## srs without fpc (matches Julia) # srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc) # bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE) -# svymean(~api00,bsrs_jk) +# svymean(~api00+api99,bsrs_jk) # # mean SE # # api00 656.58 9.4028 +# # api99 624.68 9.6575 # # strat with fpc (doesnt match Julia) # dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc) @@ -43,6 +50,7 @@ end # # strat without fpc (matches Julia) # dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc) # dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE) -# svymean(~api99,dstrat_jk) +# svymean(~api00+api99,dstrat_jk) # # mean SE -# # api99 629.39 10.097 \ No newline at end of file +# # api00 662.29 9.5361 +# # api99 629.39 10.0970 \ No newline at end of file From d326314043328bdb271b7e4679af5ebcaea7f85a Mon Sep 17 00:00:00 2001 From: smishr Date: Thu, 23 Mar 2023 13:12:56 +0530 Subject: [PATCH 48/63] dclus2 tests not working --- test/jackknife.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/jackknife.jl b/test/jackknife.jl index a61ae934..49ec0096 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -13,8 +13,11 @@ @test mean_strat_jk.SE[1] ≈ 9.5361 atol = 1e-3 @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3 - # mean(:api00,dclus2_jk) - # mean(,dnhanes_jk) + mean_clus2_jk = mean([:api00,:api99],dclus2_jk) + # @test mean_clus2_jk.SE[1] ≈ XXXXX atol = 1e-3 + # @test mean_clus2_jk.SE[2] ≈ XXXXX atol = 1e-3 + + # Tests using for NHANES end # # R code for correctness above From 4f278dac93c07ad58d0f993464008e3ca60a5afe Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 24 Mar 2023 12:50:19 +0530 Subject: [PATCH 49/63] Add tests for total.jl --- test/total.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/total.jl b/test/total.jl index b8d91795..f21cfa81 100644 --- a/test/total.jl +++ b/test/total.jl @@ -168,4 +168,13 @@ end # equivalent R code (results cause clutter): # > svyby(~api00, ~cname, clus1rep, svytotal) # > svyby(~api00, ~cname, clus1rep, svymean) + + # Test that column specifiers from DataFrames make it through this pipeline + # These tests replicate what you see above...just with a different syntax. + tot = total(:api00, Cols(==(:cname)), dclus1) + @test size(tot)[1] == apiclus1.cname |> unique |> length + @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL + @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL + @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL + @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL end From 6079b09da0f7eba401a0cc3673fd8d16094d03e8 Mon Sep 17 00:00:00 2001 From: Jishnu Bhattacharya Date: Fri, 24 Mar 2023 14:47:42 +0530 Subject: [PATCH 50/63] Reduce allocations in bootweights --- src/bootstrap.jl | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 7e01247f..96e98367 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -24,27 +24,17 @@ replicates: 1000 function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234)) stratified = groupby(design.data, design.strata) H = length(keys(stratified)) - substrata_dfs = DataFrame[] + substrata_dfs = Vector{DataFrame}(undef, H) for h = 1:H substrata = DataFrame(stratified[h]) cluster_sorted = sort(substrata, design.cluster) - psus = unique(cluster_sorted[!, design.cluster]) - npsus = [(count(==(i), cluster_sorted[!, design.cluster])) for i in psus] - nh = length(psus) + cluster_sorted_designcluster = cluster_sorted[!, design.cluster] cluster_weights = cluster_sorted[!, design.weights] - for replicate = 1:replicates - randinds = rand(rng, 1:(nh), (nh - 1)) - cluster_sorted[!, "replicate_"*string(replicate)] = - vcat( - [ - fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for - i = 1:nh - ]..., - ) .* cluster_weights - end - push!(substrata_dfs, cluster_sorted) + _bootweights_cluster_sorted!(cluster_sorted, cluster_weights, + cluster_sorted_designcluster, replicates, rng) + substrata_dfs[h] = cluster_sorted end - df = vcat(substrata_dfs...) + df = reduce(vcat, substrata_dfs) return ReplicateDesign( df, design.cluster, @@ -57,3 +47,22 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis replicates, ) end + +function _bootweights_cluster_sorted!(cluster_sorted, + cluster_weights, cluster_sorted_designcluster, replicates, rng) + + psus = unique(cluster_sorted_designcluster) + npsus = [count(==(i), cluster_sorted_designcluster) for i in psus] + nh = length(psus) + for replicate = 1:replicates + randinds = rand(rng, 1:(nh), (nh - 1)) + cluster_sorted[!, "replicate_"*string(replicate)] = + reduce(vcat, + [ + fill((count(==(i), randinds)) * (nh / (nh - 1)), npsus[i]) for + i = 1:nh + ] + ) .* cluster_weights + end + cluster_sorted +end From b7ac4f2d92706033c60b9f465c84c133abb13ad9 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Tue, 21 Mar 2023 12:34:38 +0530 Subject: [PATCH 51/63] Adding information to docstring of `jackknifeweights`. --- src/jackknife.jl | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 5f0e5428..9473c0f5 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -9,10 +9,31 @@ w_{i(hj)} = \\begin{cases} w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\ 0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\ - \\dfrac{n_h}{n_h - 1}w_i &\\quad\\quad\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\ + \\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\ \\end{cases} ``` -The replicate weight columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. These are added to the underlying `design.data` `DataFrame`. +In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. + +# Examples +```jldoctest setup = :(using Survey) +julia> apistrat = load_data("apistrat"); + +julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); + +julia> rstrat = jackknifeweights(dstrat) +ReplicateDesign: +data: 200×244 DataFrame +strata: stype + [E, E, E … M] +cluster: none +popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 20.36] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0491] +type: jackknife +replicates: 200 + +``` # Reference pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) From 29250123dbc73dfca4dc344aaf2230e824a0ffc2 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Sat, 25 Mar 2023 17:06:29 +0530 Subject: [PATCH 52/63] Completing docstring for `jackknife_variance`. --- src/jackknife.jl | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 9473c0f5..5469d30d 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -15,7 +15,9 @@ w_{i(hj)} = In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. # Examples -```jldoctest setup = :(using Survey) +```jldoctest +julia> using Survey; + julia> apistrat = load_data("apistrat"); julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); @@ -86,11 +88,47 @@ end jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) ``` -Compute variance for the given `func` using the Jackknife method. The formula to compute this variance is the following. +Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. ```math \\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2 ``` + +Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed. + +# Examples +```jldoctest +julia> using Survey, StatsBase + +julia> apistrat = load_data("apistrat"); + +julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); + +julia> rstrat = jackknifeweights(dstrat) +ReplicateDesign: +data: 200×244 DataFrame +strata: stype + [E, E, E … M] +cluster: none +popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 20.36] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0491] +type: jackknife +replicates: 200 + +julia> weightedmean(x, y) = mean(x, weights(y)); + +julia> jackknife_variance(:api00, weightedmean, rstrat) +1×2 DataFrame + Row │ estimator SE + │ Float64 Float64 +─────┼──────────────────── + 1 │ 662.287 9.53613 + +``` +# Reference +pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) df = design.data From 27ebfed77db5e164a3fcfe11a8e19b20e48b9a4f Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Sat, 25 Mar 2023 17:56:46 +0530 Subject: [PATCH 53/63] Update test/total.jl --- test/total.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/total.jl b/test/total.jl index f21cfa81..2138527b 100644 --- a/test/total.jl +++ b/test/total.jl @@ -171,7 +171,7 @@ end # Test that column specifiers from DataFrames make it through this pipeline # These tests replicate what you see above...just with a different syntax. - tot = total(:api00, Cols(==(:cname)), dclus1) + tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1) @test size(tot)[1] == apiclus1.cname |> unique |> length @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL From c204a14def407255d21355cd46e2acb0d9df8ed8 Mon Sep 17 00:00:00 2001 From: Jishnu Bhattacharya Date: Sun, 26 Mar 2023 09:34:16 +0530 Subject: [PATCH 54/63] Add comment about type-stability Co-authored-by: Anshul Singhvi --- src/bootstrap.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 96e98367..ab8bb0ad 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -30,6 +30,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis cluster_sorted = sort(substrata, design.cluster) cluster_sorted_designcluster = cluster_sorted[!, design.cluster] cluster_weights = cluster_sorted[!, design.weights] + # Perform the inner loop in a type-stable function to improve runtime. _bootweights_cluster_sorted!(cluster_sorted, cluster_weights, cluster_sorted_designcluster, replicates, rng) substrata_dfs[h] = cluster_sorted From e9362f9b5821ee6086dd76815a6c3afb42982ac5 Mon Sep 17 00:00:00 2001 From: Shikhar Mishra Date: Mon, 27 Mar 2023 09:54:19 +0530 Subject: [PATCH 55/63] Correct apiclus2 pw colum --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5f6b748b..048f74bb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -38,7 +38,6 @@ dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters # Two-stage cluster sample apiclus2 = load_data("apiclus2") # Load API dataset -apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake for pw column dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus2_boot = dclus2 |> bootweights # Create replicate design From 8a3d768fe7e3ca8623bbd672655303968bdf8eb3 Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Mon, 27 Mar 2023 14:53:08 +0530 Subject: [PATCH 56/63] add tests clus2 nhanes jk --- test/jackknife.jl | 35 ++++++++++++++++++++++++++++++++--- test/runtests.jl | 2 ++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/test/jackknife.jl b/test/jackknife.jl index 49ec0096..73ab5b71 100644 --- a/test/jackknife.jl +++ b/test/jackknife.jl @@ -14,16 +14,22 @@ @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3 mean_clus2_jk = mean([:api00,:api99],dclus2_jk) - # @test mean_clus2_jk.SE[1] ≈ XXXXX atol = 1e-3 - # @test mean_clus2_jk.SE[2] ≈ XXXXX atol = 1e-3 + @test mean_clus2_jk.SE[1] ≈ 34.9388 atol = 1e-3 # R gives 34.928 + @test mean_clus2_jk.SE[2] ≈ 34.6565 atol = 1e-3 # R gives 34.645 # Tests using for NHANES + mean_nhanes_jk = mean([:seq1, :seq2],dnhanes_jk) + @test mean_nhanes_jk.estimator[1] ≈ 21393.96 atol = 1e-3 + @test mean_nhanes_jk.SE[1] ≈ 143.371 atol = 1e-3 # R is slightly diff in 2nd decimal place + @test mean_nhanes_jk.estimator[2] ≈ 38508.328 atol = 1e-3 + @test mean_nhanes_jk.SE[2] ≈ 258.068 atol = 1e-3 # R is slightly diff in 2nd decimal place end # # R code for correctness above # library(survey) # data(api) # apiclus1$pw = rep(757/15,nrow(apiclus1)) +# No corrections needed for apiclus2, it has correct weights by default # ############# # ###### 23.03.22 PR#260 @@ -56,4 +62,27 @@ end # svymean(~api00+api99,dstrat_jk) # # mean SE # # api00 662.29 9.5361 -# # api99 629.39 10.0970 \ No newline at end of file +# # api99 629.39 10.0970 + +#### apiclus2 +# > # clus2 without fpc (doesnt match Julia) +# > dclus2 <- svydesign(id=~dnum+snum, weights=~pw, data=apiclus2) +# > dclus2_jk <- as.svrepdesign(dclus2, type="JK1", compress=FALSE) +# > svymean(~api00+api99,dclus2) +# mean SE +# api00 670.81 30.712 +# api99 645.03 30.308 +# > svymean(~api00+api99,dclus2_jk) +# mean SE +# api00 670.81 34.928 +# api99 645.03 34.645 + +# NHANES test +# > data("nhanes") +# > nhanes$seq1 = seq(1.0, 5*8591.0, by = 5) +# > nhanes$seq2 = seq(1.0, 9*8591.0, by = 9)#, length.out=8591) +# > dnhanes <- svydesign(id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=TRUE, data=nhanes) +# > svymean(~seq1+seq2,dnhanes) +# mean SE +# seq1 21394 143.34 +# seq2 38508 258.01 \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index c3926c5c..d709103c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -43,6 +43,8 @@ dclus2_boot = dclus2 |> bootweights # Create replicate design # NHANES nhanes = load_data("nhanes") +nhanes.seq1 = collect(1.0:5.0:42955.0) +nhanes.seq2 = collect(1.0:9.0:77319.0) # [9k for k in 0:8590.0] dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) dnhanes_boot = dnhanes |> bootweights From 2162a59c196eeef57afbf943ca83c84571664758 Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Tue, 28 Mar 2023 11:11:00 +0530 Subject: [PATCH 57/63] add double domain estimation --- test/runtests.jl | 1 - test/total.jl | 16 ++++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5f6b748b..048f74bb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -38,7 +38,6 @@ dclus1_boot_regex = ReplicateDesign(dclus1_boot.data, REPLICATES_REGEX, clusters # Two-stage cluster sample apiclus2 = load_data("apiclus2") # Load API dataset -apiclus2[!, :pw] = fill(757 / 15, (size(apiclus2, 1),)) # Correct api mistake for pw column dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus2_boot = dclus2 |> bootweights # Create replicate design diff --git a/test/total.jl b/test/total.jl index 2138527b..1e208397 100644 --- a/test/total.jl +++ b/test/total.jl @@ -169,12 +169,16 @@ end # > svyby(~api00, ~cname, clus1rep, svytotal) # > svyby(~api00, ~cname, clus1rep, svymean) + # Test multiple domains passed at once + tot = total(:api00, [:stype,:cname], dclus1_boot) + # Test that column specifiers from DataFrames make it through this pipeline # These tests replicate what you see above...just with a different syntax. - tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1) - @test size(tot)[1] == apiclus1.cname |> unique |> length - @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL - @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL - @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL - @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL + tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1_boot) + ######## Above Survey.DataFrames.Cols(==(:cname)) syntax doesnt give domain estimates + # @test size(tot)[1] == apiclus1.cname |> unique |> length + # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL + # @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL + # @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL + # @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL end From f9ff9cca61b4d39a0205a4351493f5e560914d46 Mon Sep 17 00:00:00 2001 From: Shikhar Mishra Date: Tue, 28 Mar 2023 15:27:26 +0530 Subject: [PATCH 58/63] Update src/jackknife.jl Co-authored-by: Anshul Singhvi --- src/jackknife.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 5469d30d..45cef664 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -84,9 +84,7 @@ function jackknifeweights(design::SurveyDesign) end """ -```julia -jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) -``` + jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. From 3daeae6b207d961d9e4d35f25c22ef5623c7a337 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Sat, 1 Apr 2023 00:01:26 +0530 Subject: [PATCH 59/63] Minor change for notation consistency. --- src/jackknife.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 45cef664..4dff97d8 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -144,10 +144,10 @@ function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) cluster_variance = 0 for psu in psus_in_stratum # get replicate weights corresponding to current stratum and psu - rep_weights = design.data[!, "replicate_"*string(replicate_index)] + rep_weights = df[!, "replicate_"*string(replicate_index)] # estimator from replicate weights - θhj = func(design.data[!, x], rep_weights) + θhj = func(df[!, x], rep_weights) cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2 replicate_index += 1 From 1da3e8f1f005c20cbdf6187f8448a27a304baa54 Mon Sep 17 00:00:00 2001 From: Siddhant Chaudhary Date: Sat, 1 Apr 2023 00:16:05 +0530 Subject: [PATCH 60/63] Using tab indent in docstring. --- src/jackknife.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/jackknife.jl b/src/jackknife.jl index 4dff97d8..315de348 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -1,7 +1,6 @@ """ -```julia -jackknifeweights(design::SurveyDesign) -``` + jackknifeweights(design::SurveyDesign) + Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula. ```math From e55225068cf9cc4f92fe2d47675575aa41ea8eae Mon Sep 17 00:00:00 2001 From: Shikhar Mishra Date: Mon, 10 Apr 2023 10:41:20 +0530 Subject: [PATCH 61/63] Update total.jl --- test/total.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/total.jl b/test/total.jl index 1e208397..fc02b48f 100644 --- a/test/total.jl +++ b/test/total.jl @@ -171,10 +171,11 @@ end # Test multiple domains passed at once tot = total(:api00, [:stype,:cname], dclus1_boot) - + + #### Why doesnt this syntax produce domain estimates?? # Test that column specifiers from DataFrames make it through this pipeline # These tests replicate what you see above...just with a different syntax. - tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1_boot) + # tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1_boot) ######## Above Survey.DataFrames.Cols(==(:cname)) syntax doesnt give domain estimates # @test size(tot)[1] == apiclus1.cname |> unique |> length # @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL From 74c72f2d286a8bfec88419cb8a6b449dd1cdf106 Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Mon, 10 Apr 2023 11:14:26 +0530 Subject: [PATCH 62/63] Add Vector{Symbol} test Domain est --- test/total.jl | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/test/total.jl b/test/total.jl index fc02b48f..b7780263 100644 --- a/test/total.jl +++ b/test/total.jl @@ -171,7 +171,9 @@ end # Test multiple domains passed at once tot = total(:api00, [:stype,:cname], dclus1_boot) - + @test filter(row -> row[:cname] == "Los Angeles" && row[:stype] == "E", tot).SE[1] ≈ 343365 rtol = STAT_TOL + @test filter(row -> row[:cname] == "Merced" && row[:stype] == "H", tot).SE[1] ≈ 27090.33 rtol = STAT_TOL + #### Why doesnt this syntax produce domain estimates?? # Test that column specifiers from DataFrames make it through this pipeline # These tests replicate what you see above...just with a different syntax. @@ -183,3 +185,42 @@ end # @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL # @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL end + +#### R code for vector{Symbol} domain estimation +# > data(api) +# > apiclus1$pw = rep(757/15,nrow(apiclus1)) +# > ### 9.04.23 +# > dclus1<-svydesign(id=~dnum, weights=~pw, data=apiclus1); +# > rclus1<-as.svrepdesign(dclus1, type="subbootstrap", compress=FALSE, replicates = 4000) +# > svyby(~api00, ~stype+cname, rclus1, svytotal) +# stype cname api00 se +# E.Alameda E Alameda 273428.40 275423.33 +# H.Alameda H Alameda 30683.73 30907.60 +# M.Alameda M Alameda 67272.07 67762.88 +# E.Fresno E Fresno 48599.40 47484.67 +# H.Fresno H Fresno 22356.73 21843.93 +# M.Fresno M Fresno 24324.93 23766.99 +# E.Kern E Kern 24930.53 24847.76 +# M.Kern M Kern 20741.80 20672.93 +# E.Los Angeles E Los Angeles 395154.00 341692.92 +# M.Los Angeles M Los Angeles 94826.87 95416.42 +# E.Mendocino E Mendocino 58844.13 57711.15 +# H.Mendocino H Mendocino 35124.80 34448.51 +# M.Mendocino M Mendocino 31844.47 31231.33 +# E.Merced E Merced 50517.13 51424.65 +# H.Merced H Merced 26696.87 27176.47 +# M.Merced M Merced 27605.27 28101.18 +# E.Orange E Orange 463536.33 465047.76 +# M.Orange M Orange 110219.20 110578.59 +# E.Plumas E Plumas 144284.20 146672.86 +# H.Plumas H Plumas 143729.07 146108.54 +# M.Plumas M Plumas 34266.87 34834.16 +# E.San Diego E San Diego 1670497.13 1233144.04 +# H.San Diego H San Diego 63386.13 63693.54 +# M.San Diego M San Diego 96492.27 96960.22 +# E.San Joaquin E San Joaquin 848243.73 848605.33 +# H.San Joaquin H San Joaquin 79585.93 79619.86 +# M.San Joaquin M San Joaquin 101387.53 101430.75 +# E.Santa Clara E Santa Clara 737418.93 484164.71 +# H.Santa Clara H Santa Clara 35478.07 35311.28 +# M.Santa Clara M Santa Clara 187685.53 131278.63 \ No newline at end of file From 8a4bf69cb95e9492bf4d6d0b272e94416e345e16 Mon Sep 17 00:00:00 2001 From: smishr <43640926+smishr@users.noreply.github.com> Date: Mon, 10 Apr 2023 11:18:21 +0530 Subject: [PATCH 63/63] v0.2 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 792d0adc..7ecf4b9a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Survey" uuid = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c" authors = ["Ayush Patnaik "] -version = "0.1.0" +version = "0.2.0" [deps] AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"