diff --git a/Project.toml b/Project.toml
index 2d24fe2f..09562ba5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ authors = ["Alberto Mercurio"]
 version = "0.8.2"
 
 [deps]
+ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 DiffEqCallbacks = "459566f4-90b8-5000-8ac3-15dfb0a30def"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
@@ -25,6 +26,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 QuantumToolboxCUDAExt = "CUDA"
 
 [compat]
+ArrayInterface = "6, 7"
 CUDA = "5"
 DiffEqCallbacks = "2, 3"
 FFTW = "1.5"
diff --git a/src/QuantumToolbox.jl b/src/QuantumToolbox.jl
index 4790d42e..246d19d1 100644
--- a/src/QuantumToolbox.jl
+++ b/src/QuantumToolbox.jl
@@ -22,6 +22,7 @@ else
 end
 
 # other dependencies (in alphabetical order)
+import ArrayInterface: allowed_getindex, allowed_setindex!
 import DiffEqCallbacks: DiscreteCallback, PeriodicCallback, PresetTimeCallback
 import FFTW: fft, fftshift
 import Graphs: connected_components, DiGraph
diff --git a/src/general_functions.jl b/src/general_functions.jl
index 61ac28d7..3178dc76 100644
--- a/src/general_functions.jl
+++ b/src/general_functions.jl
@@ -442,3 +442,6 @@ Convert a quantum object from matrix ([`OperatorQuantumObject`](@ref)-type) to v
 """
 mat2vec(A::QuantumObject{<:AbstractArray{T},OperatorQuantumObject}) where {T} =
     QuantumObject(mat2vec(A.data), OperatorKet, A.dims)
+
+_get_dense_similar(A::AbstractArray, args...) = similar(A, args...)
+_get_dense_similar(A::AbstractSparseMatrix, args...) = similar(nonzeros(A), args...)
diff --git a/src/steadystate.jl b/src/steadystate.jl
index 5ae6400a..e26283f9 100644
--- a/src/steadystate.jl
+++ b/src/steadystate.jl
@@ -26,15 +26,25 @@ function _steadystate(
     L::QuantumObject{<:AbstractArray{T},SuperOperatorQuantumObject},
     solver::SteadyStateDirectSolver,
 ) where {T}
-    L_tmp = copy(L.data)
+    L_tmp = L.data
     N = prod(L.dims)
     weight = norm(L_tmp, 1) / length(L_tmp)
-    v0 = zeros(ComplexF64, N^2) # This is not scalable for GPU arrays
-    v0[1] = weight
 
-    L_tmp[1, [N * (i - 1) + i for i in 1:N]] .+= weight
+    v0 = _get_dense_similar(L_tmp, N^2)
+    fill!(v0, 0)
+    allowed_setindex!(v0, weight, 1) # Because scalar indexing is not allowed on GPU arrays
 
-    ρss_vec = L_tmp \ v0
+    idx_range = collect(1:N)
+    rows = _get_dense_similar(L_tmp, N)
+    cols = _get_dense_similar(L_tmp, N)
+    datas = _get_dense_similar(L_tmp, N)
+    fill!(rows, 1)
+    copyto!(cols, N .* (idx_range .- 1) .+ idx_range)
+    fill!(datas, weight)
+    Tn = sparse(rows, cols, datas, N^2, N^2)
+    L_tmp = L_tmp + Tn
+
+    ρss_vec = L_tmp \ v0 # This is still not supported on GPU, yet
     ρss = reshape(ρss_vec, N, N)
     ρss = (ρss + ρss') / 2 # Hermitianize
     return QuantumObject(ρss, Operator, L.dims)