Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Hartley Rao and other improvements #247

Draft
wants to merge 19 commits into
base: next_release
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand Down
2 changes: 2 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Presently, summary statistics such as `mean`, `total`, `ratio`, and `quantile` c

The package is built on top of [DataFrames.jl](https://dataframes.juliadata.org/stable/) and supports a variety of features for data manipulation. Plots are generated using [AlgebraOfGraphics](https://github.com/MakieOrg/AlgebraOfGraphics.jl).

Discourse [post](https://discourse.julialang.org/t/ann-announcing-survey-jl-for-analysis-of-complex-surveys/94667) announcing the package.

## Plans
We plan for efficient implementations of all the methods in R `survey`. Features for future releases will include:

Expand Down
5 changes: 5 additions & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ using AlgebraOfGraphics
using CategoricalArrays
using Random
using Missings
using RData
using Downloads

include("SurveyDesign.jl")
include("bootstrap.jl")
Expand All @@ -26,6 +28,8 @@ include("show.jl")
include("ratio.jl")
include("by.jl")
include("jackknife.jl")
include("ht.jl")
include("analytical.jl")

export load_data
export AbstractSurveyDesign, SurveyDesign, ReplicateDesign
Expand All @@ -37,5 +41,6 @@ export boxplot
export bootweights
export ratio
export jackknifeweights, jackknife_variance
export HartleyRao

end
18 changes: 18 additions & 0 deletions src/analytical.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""
Variance under Simple Random Sampling scheme
see Lohr pg78.
"""
function srs_variance(x::Symbol, design::AbstractSurveyDesign)
ŝ = var(design.data[!,x]) ./ nrow(design.data)
X̂ = wsum(design.data[!, x], weights(design.data[!, design.weights]))
DataFrame(ht_total = X̂)
end

"""
Strata totals
"""
function strata_totals(x::Symbol, design::AbstractSurveyDesign)
gdf_strata = groupby(design.data, design.strata)
X̂ₛ = combine(gdf_strata, [x, design.weights] => ((a, b) -> wsum(a, b)) => :total)
return X̂ₛ
end
28 changes: 28 additions & 0 deletions src/ht.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator.
Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme.
"""
function HartleyRao(x::Symbol, design::SurveyDesign, HT_total)
y = design.data[!,x]
πᵢ = design.data[!,design.allprobs]
hartley_rao_var = sum((1 .- ((design.data[!,design.sampsize] .- 1) ./ design.data[!,design.sampsize]) .* πᵢ) .*
((y .* design.data[!,design.weights]) .- (HT_total ./ design.data[!,design.sampsize])) .^ 2)
return hartley_rao_var
end

"""
HorvitzThompsonTotal(x, design)

Horvitz-Thompson total
"""
function HorvitzThompsonTotal(x::Symbol, design::SurveyDesign)
X̂ₕₜ = wsum(design.data[!, x], weights(design.data[!, design.weights]))
var = HartleyRao(x, design, X̂ₕₜ)
DataFrame(total = X̂ₕₜ, var = var)
end

function HorvitzThompsonTotal(x::Vector{Symbol}, design::SurveyDesign)
df = reduce(vcat, [HorvitzThompsonTotal(i, design) for i in x])
insertcols!(df, 1, :names => String.(x))
return df
end
5 changes: 3 additions & 2 deletions src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ julia> total(:api00, dclus1)
```
"""
function total(x::Symbol, design::SurveyDesign)
X = wsum(design.data[!, x], weights(design.data[!, design.weights]))
DataFrame(total = X)
X̂ = wsum(design.data[!, x], weights(design.data[!, design.weights]))
variance = HartleyRao(x, design, X̂)
DataFrame(total = X̂, SE = sqrt(variance))
end

"""
Expand Down
2 changes: 2 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
3 changes: 3 additions & 0 deletions test/analytical.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# @testset "analytical_ht_total" begin

# end
12 changes: 12 additions & 0 deletions test/ht.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
@testset "hartley_rao" begin
### I havent been able to reproduce below in R survey, which give somewhat different SE estimates
# base functionality SRS
tot = total(:api00, srs)
@test tot.SE[1] ≈ 57154.1 rtol = STAT_TOL
# Stratified
tot = total(:api00, dstrat)
@test tot.SE[1] ≈ 699405 rtol = STAT_TOL
# one stage cluster
tot = total(:api00, dclus1)
@test tot.SE[1] ≈ 4880240 rtol = STAT_TOL
end
7 changes: 6 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ apiclus2 = load_data("apiclus2") # Load API dataset
dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
dclus2_boot = dclus2 |> bootweights # Create replicate design

# download and test using popular multistage stratified surveys
## TODO

# NHANES
nhanes = load_data("nhanes")
nhanes.seq1 = collect(1.0:5.0:42955.0)
Expand All @@ -63,4 +66,6 @@ include("hist.jl")
include("boxplot.jl")
include("ratio.jl")
include("show.jl")
include("jackknife.jl")
include("jackknife.jl")
include("analytical.jl")
include("ht.jl")