Skip to content

Commit

Permalink
Allow Kinds to be registered by packages outside JuliaSyntax (#461)
Browse files Browse the repository at this point in the history
Extensible kinds are quite tricky. We want to
* Use a small number of bits for them
* Have the string representation in the source, but have the compiler
  able to fully inline the integer representation.
* Allow modules with different kinds to cooperate together on the same
  integer representation.
* Not trigger invalidation when new kinds are added
* Not require cooperation between different `Kind` modules

This is a very hard set of constraints to satisfy. The last one is
already impossible in a single flat namespace so in this design we've
given up on it and require cooperation between all kind extension
modules, including module authors allocating non-colliding id's for
their modules, in addition to non-colliding kind names.
  • Loading branch information
c42f authored Jul 18, 2024
1 parent a63e8bb commit f99b76f
Show file tree
Hide file tree
Showing 3 changed files with 250 additions and 129 deletions.
319 changes: 190 additions & 129 deletions src/kinds.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,194 @@
# Definition of Kind type - mapping from token string identifiers to
# enumeration values as used in @K_str
const _kind_names =
[

"""
K"name"
Kind(namestr)
`Kind` is a type tag for specifying the type of tokens and interior nodes of
a syntax tree. Abstractly, this tag is used to define our own *sum types* for
syntax tree nodes. We do this explicitly outside the Julia type system because
(a) Julia doesn't have sum types and (b) we want concrete data structures which
are unityped from the Julia compiler's point of view, for efficiency.
Naming rules:
* Kinds which correspond to exactly one textual form are represented with that
  text. This includes keywords like K"for" and operators like K"*".
* Kinds which represent many textual forms have UpperCamelCase names. This
  includes kinds like K"Identifier" and K"Comment".
* Kinds which exist merely as delimiters are all uppercase
"""
primitive type Kind 16 end # 16-bit opaque tag; `Kind(x::Integer)` bitcasts a `UInt16` id into it

# The implementation of Kind here is basically similar to @enum. However we use
# the K_str macro to self-name these kinds with their literal representation,
# rather than needing to invent a new name for each.

# Global registries, filled in by `register_kinds!`.
# Kind name => integer id. `BEGIN_`/`END_` marker names are stored here too.
const _kind_str_to_int = Dict{String,UInt16}()
# Integer id => kind name. Only normal (non-marker) kinds are stored here.
const _kind_int_to_str = Dict{UInt16,String}()
# Module id => owner of that id. An entry is a `Symbol` (the module's name)
# until that module registers its kinds, at which point `_register_kinds!`
# replaces it with the `Module` itself.
const _kind_modules = Dict{Int,Union{Symbol,Module}}(
0=>:JuliaSyntax,
1=>:JuliaLowering,
2=>:JuliaSyntaxFormatter
)
# Number of bits reserved for kind id's belonging to a single module
const _kind_nbits = 10
# Largest usable module id: whatever remains of the 16 bits above `_kind_nbits`
const _kind_module_id_max = typemax(UInt16) >> _kind_nbits

# Construct a `Kind` from its raw integer id.
#
# Throws an `ArgumentError` unless `x` lies in `0:typemax(UInt16)`; otherwise
# the value is converted to `UInt16` and bitcast to `Kind`.
function Kind(x::Integer)
    (0 <= x <= typemax(UInt16)) ||
        throw(ArgumentError("Kind out of range: $x"))
    return Base.bitcast(Kind, convert(UInt16, x))
end

# Look up the name registered for `k` in `_kind_int_to_str`. Indexing the
# `Dict` throws a `KeyError` when no name is recorded for the id.
Base.convert(::Type{String}, k::Kind) = _kind_int_to_str[reinterpret(UInt16, k)]

# Resolve a kind name to its `Kind` via `_kind_str_to_int`.
# Calls `error` when the name has not been registered.
function Base.convert(::Type{Kind}, s::AbstractString)
    id = get(_kind_str_to_int, s, nothing)
    if isnothing(id)
        error("unknown Kind name $(repr(s))")
    end
    return Kind(id)
end

# `string` and `print` both produce the bare kind name, without the `K"..."`
# wrapper that `show` adds.
function Base.string(x::Kind)
    return convert(String, x)
end

function Base.print(io::IO, x::Kind)
    return print(io, convert(String, x))
end

# Order kinds by their underlying integer ids.
function Base.isless(x::Kind, y::Kind)
    return reinterpret(UInt16, x) < reinterpret(UInt16, y)
end

# Display a kind in the same `K"name"` form used to write it in source code.
Base.show(io::IO, k::Kind) = print(io, "K\"$(convert(String, k))\"")

# Save the string representation rather than the bit pattern so that kinds
# can be serialized and deserialized across different JuliaSyntax versions.
# Write `k` as a one-byte length prefix followed by the bytes of its name.
# Returns the total number of bytes written.
function Base.write(io::IO, k::Kind)
    str = convert(String, k)
    # The prefix must count bytes, not characters: `write(io, str)` emits
    # `sizeof(str)` bytes and `read(io, Kind)` reads back exactly the prefixed
    # number of bytes. Using `length(str)` would under-count any name
    # containing non-ASCII characters and break the round trip.
    # `UInt8(...)` throws an `InexactError` for names longer than 255 bytes.
    return write(io, UInt8(sizeof(str))) + write(io, str)
end
# Read a `Kind` stored as a one-byte length prefix followed by that many bytes
# of the kind's name, then resolve the name with `convert(Kind, name)`.
function Base.read(io::IO, ::Type{Kind})
    nbytes = read(io, UInt8)
    name = String(read(io, nbytes))
    return convert(Kind, name)
end

# Return the module which owns `k`. The module id is held in the bits of the
# kind above the low `_kind_nbits` bits. The `::Module` assertion fails with a
# `TypeError` when only a `Symbol` placeholder is recorded for that id.
function Base.parentmodule(k::Kind)
    id_bits = reinterpret(UInt16, k)
    owner = _kind_modules[id_bits >> _kind_nbits]
    return owner::Module
end

# Register `names` as the kinds owned by `mod` under `module_id`, recording the
# result in the tables passed in by the caller.
#
# * `kind_modules`    - module id => owning `Module`, or a `Symbol` placeholder
#                       for a known module which has not registered yet.
# * `int_to_kindstr`  - kind id => name, for normal (non-marker) kinds only.
# * `kind_str_to_int` - name => kind id, including `BEGIN_`/`END_` markers.
#
# Calls `error` when `module_id` is out of range, when there are too many
# names, when `module_id` is already claimed by a different module, or when
# `mod` re-registers with names that do not match its existing kinds. A
# consistent re-registration (e.g. top level followed by `__init__`) returns
# without changing anything.
function _register_kinds!(kind_modules, int_to_kindstr, kind_str_to_int, mod, module_id, names)
    # Validate before touching any table. A negative id used to slip through
    # here and be recorded in `kind_modules`, with the call only failing later
    # when the negative kind id was stored (an `InexactError` with the
    # `UInt16`-valued tables used by `register_kinds!`).
    if module_id < 0 || module_id > _kind_module_id_max
        error("Kind module id $module_id is out of range")
    elseif length(names) >= 1 << _kind_nbits
        error("Too many kind names")
    elseif !haskey(kind_modules, module_id)
        kind_modules[module_id] = mod
    else
        m = kind_modules[module_id]
        if m == nameof(mod)
            # Ok: known kind module, but not loaded until now
            kind_modules[module_id] = mod
        elseif m == mod
            # `mod` has registered before: every name must already map to a
            # kind owned by `mod`, in the same relative order.
            existing_kinds = [(i = get(kind_str_to_int, n, nothing);
                               isnothing(i) ? nothing : Kind(i)) for n in names]
            if any(isnothing, existing_kinds) ||
                    !issorted(existing_kinds) ||
                    any(k->parentmodule(k) != mod, existing_kinds)
                error("Error registering kinds for module $mod (register_kinds() called more than once inconsistently, or conflict with existing module kinds?)")
            else
                # Assume we're re-registering kinds as in top level vs `__init__`
                return
            end
        else
            error("Kind module ID $module_id already claimed by module $m")
        end
    end
    # Process names to conflate category BEGIN/END markers with the first/last
    # in the category.
    i = 0
    for name in names
        normal_kind = false
        if startswith(name, "BEGIN_")
            # Alias the id that the next normal kind will receive
            j = i
        elseif startswith(name, "END_")
            # Alias the id of the most recently assigned normal kind
            j = i - 1
        else
            normal_kind = true
            j = i
            i += 1
        end
        # Module id in the high bits, per-module index in the low bits
        kind_int = (module_id << _kind_nbits) | j
        push!(kind_str_to_int, name=>kind_int)
        if normal_kind
            # Only normal kinds get a reverse (id => name) entry
            push!(int_to_kindstr, kind_int=>name)
        end
    end
end

"""
register_kinds!(mod, module_id, names)
Register custom `Kind`s with the given `names`, belonging to a module `mod`.
`names` is an array of arbitrary strings.
In order for kinds to be represented by a small number of bits, some nontrivial
cooperation is required between modules using custom kinds:
* The integer `module_id` is globally unique for each `mod` which will be used
together, and not larger than $_kind_module_id_max.
* No two modules register the same `name`. The semantics of a given `kind` name
should be defined by the module which owns it.
To allow ranges of kinds to be delimited and quickly tested for, some special
names are allowed: `BEGIN_section` and `END_section` pairs are detected, and
alias the next and previous kind id's respectively so that kinds in `section`
can be tested with `BEGIN_section <= k <= END_section`.
"""
function register_kinds!(mod, module_id, names)
    # Delegate to the table-parameterised worker, supplying the global
    # registries defined in this file.
    return _register_kinds!(
        _kind_modules, _kind_int_to_str, _kind_str_to_int,
        mod, module_id, names,
    )
end

#-------------------------------------------------------------------------------

"""
K"s"
The kind of a token or AST internal node with string "s".
For example
* K")" is the kind of the right parenthesis token
* K"block" is the kind of a block of code (eg, statements within a begin-end).
"""
macro K_str(name)
    # The lookup runs during macro expansion, so the macro returns the
    # resolved `Kind` value itself rather than code which computes it.
    return convert(Kind, name)
end

"""
A set of kinds which can be used with the `in` operator. For example
k in KSet"+ - *"
"""
macro KSet_str(str)
    # Resolve each whitespace-separated name while the macro expands
    kinds = map(name -> convert(Kind, name), split(str))

    # Splice the resolved kinds into a tuple literal
    return quote
        ($(kinds...),)
    end
end

"""
kind(x)
Return the `Kind` of `x`.
"""
function kind(k::Kind)
    # A `Kind` is its own kind
    return k
end


#-------------------------------------------------------------------------------
# Kinds used by JuliaSyntax
register_kinds!(JuliaSyntax, 0, [
"None" # Placeholder; never emitted by lexer
"EndMarker" # EOF
"Comment"
Expand Down Expand Up @@ -918,133 +1105,7 @@ const _kind_names =
# Container for a single statement/atom plus any trivia and errors
"wrapper"
"END_SYNTAX_KINDS"
]

"""
K"name"
Kind(id)
`Kind` is a type tag for specifying the type of tokens and interior nodes of
a syntax tree. Abstractly, this tag is used to define our own *sum types* for
syntax tree nodes. We do this explicitly outside the Julia type system because
(a) Julia doesn't have sum types and (b) we want concrete data structures which
are unityped from the Julia compiler's point of view, for efficiency.
Naming rules:
* Kinds which correspond to exactly one textual form are represented with that
  text. This includes keywords like K"for" and operators like K"*".
* Kinds which represent many textual forms have UpperCamelCase names. This
  includes kinds like K"Identifier" and K"Comment".
* Kinds which exist merely as delimiters are all uppercase
"""
primitive type Kind 16 end # 16-bit opaque tag; `Kind(x::Integer)` bitcasts an integer id into it

# The implementation of Kind here is basically similar to @enum. However we use
# the K_str macro to self-name these kinds with their literal representation,
# rather than needing to invent a new name for each.

# Build the name => id table from `_kind_names`, strip the BEGIN_/END_ markers
# out of `_kind_names`, then generate the methods which depend on the result.
let kind_int_type = :UInt16
# Preprocess _kind_names to conflate category markers with the first/last
# in the category.
kindstr_to_int = Dict{String,UInt16}()
i = 1
while i <= length(_kind_names)
kn = _kind_names[i]
# Ids are zero-based positions in the marker-free list
kind_int = i-1
if startswith(kn, "BEGIN_")
# Removing the marker slides the next name into index `i`, so the marker
# shares the id `i-1` with that next name.
deleteat!(_kind_names, i)
elseif startswith(kn, "END_")
# An END_ marker aliases the id of the name before it (index `i-1`)
kind_int = i-2
deleteat!(_kind_names, i)
else
i += 1
end
push!(kindstr_to_int, kn=>kind_int)
end

# Largest valid id, now that `_kind_names` holds only normal kinds
max_kind_int = length(_kind_names)-1

# `$`-interpolation splices `kind_int_type`, `max_kind_int` and the lookup
# table computed above into the generated method definitions.
@eval begin
function Kind(x::Integer)
if x < 0 || x > $max_kind_int
throw(ArgumentError("Kind out of range: $x"))
end
return Base.bitcast(Kind, convert($kind_int_type, x))
end

# Ids are zero-based, `_kind_names` is one-based
Base.convert(::Type{String}, k::Kind) = _kind_names[1 + reinterpret($kind_int_type, k)]

# The table is captured in a `let` rather than stored in a global
let kindstr_to_int=$kindstr_to_int
function Base.convert(::Type{Kind}, s::AbstractString)
i = get(kindstr_to_int, s) do
error("unknown Kind name $(repr(s))")
end
Kind(i)
end
end

Base.string(x::Kind) = convert(String, x)
Base.print(io::IO, x::Kind) = print(io, convert(String, x))

Base.typemin(::Type{Kind}) = Kind(0)
Base.typemax(::Type{Kind}) = Kind($max_kind_int)

# Kinds compare by their integer ids
Base.:<(x::Kind, y::Kind) = reinterpret($kind_int_type, x) < reinterpret($kind_int_type, y)

# Lazily enumerate every kind from `typemin(Kind)` to `typemax(Kind)`
Base.instances(::Type{Kind}) = (Kind(i) for i in reinterpret($kind_int_type, typemin(Kind)):reinterpret($kind_int_type, typemax(Kind)))
end
end

# Display a kind in the same `K"name"` form used to write it in source code.
Base.show(io::IO, k::Kind) = print(io, "K\"$(convert(String, k))\"")

# Save the string representation rather than the bit pattern so that kinds
# can be serialized and deserialized across different JuliaSyntax versions.
# Write `k` as a one-byte length prefix followed by the bytes of its name.
# Returns the total number of bytes written.
function Base.write(io::IO, k::Kind)
    str = convert(String, k)
    # The prefix must count bytes, not characters: `write(io, str)` emits
    # `sizeof(str)` bytes and `read(io, Kind)` reads back exactly the prefixed
    # number of bytes. Using `length(str)` would under-count any name
    # containing non-ASCII characters and break the round trip.
    # `UInt8(...)` throws an `InexactError` for names longer than 255 bytes.
    return write(io, UInt8(sizeof(str))) + write(io, str)
end
# Read a `Kind` stored as a one-byte length prefix followed by that many bytes
# of the kind's name, then resolve the name with `convert(Kind, name)`.
function Base.read(io::IO, ::Type{Kind})
    nbytes = read(io, UInt8)
    name = String(read(io, nbytes))
    return convert(Kind, name)
end

#-------------------------------------------------------------------------------

"""
K"s"
The kind of a token or AST internal node with string "s".
For example
* K")" is the kind of the right parenthesis token
* K"block" is the kind of a block of code (eg, statements within a begin-end).
"""
macro K_str(name)
    # The lookup runs during macro expansion, so the macro returns the
    # resolved `Kind` value itself rather than code which computes it.
    return convert(Kind, name)
end

"""
A set of kinds which can be used with the `in` operator. For example
k in KSet"+ - *"
"""
macro KSet_str(str)
    # Resolve each whitespace-separated name while the macro expands
    kinds = map(name -> convert(Kind, name), split(str))

    # Splice the resolved kinds into a tuple literal
    return quote
        ($(kinds...),)
    end
end

"""
kind(x)
Return the `Kind` of `x`.
"""
function kind(k::Kind)
    # A `Kind` is its own kind
    return k
end
])

#-------------------------------------------------------------------------------
const _nonunique_kind_names = Set([
Expand Down
59 changes: 59 additions & 0 deletions test/kinds.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Only test this once per session, as kind modules must be unique (ugh)
if !isdefined(@__MODULE__, :FooKinds)
@eval module FooKinds

using JuliaSyntax

# Register this module's kinds under module id 42. The nested BEGIN_/END_
# markers delimit a FOO section containing a FOOBAR subsection.
function _init_kinds()
JuliaSyntax.register_kinds!(@__MODULE__, 42, [
"BEGIN_FOO"
"foo_1"
"foo_2"
"BEGIN_FOOBAR"
"foobar_1"
"foobar_2"
"END_FOOBAR"
"END_FOO"
])
end

# Register while the module body is being evaluated, so that the K"..."
# literal below can be resolved when it is macro-expanded.
_init_kinds()

# Captured during module definition; the testset compares it with the same
# literal expanded later.
k_before_init = K"foo_1"

# Registering again from `__init__` exercises the consistent re-registration
# path of `register_kinds!`.
function __init__()
_init_kinds()
end

end

# A second module, used only as the `mod` argument of registrations which are
# expected to fail.
@eval module BarKinds
# Intentionally empty
end

end

@testset "Kinds" begin
# Distinct names map to distinct kinds
@test K"foo_1" != K"foo_2"

# The kind captured while FooKinds was being defined matches a later lookup
@test FooKinds.k_before_init == K"foo_1"

# BEGIN_ markers alias the next kind; END_ markers alias the previous kind
@test K"BEGIN_FOO" == K"foo_1"
@test K"foo_2" < K"BEGIN_FOOBAR"
@test K"BEGIN_FOOBAR" == K"foobar_1"
@test K"END_FOOBAR" == K"foobar_2"
@test K"END_FOO" == K"foobar_2"

@test parentmodule(K"foo_1") == FooKinds
@test sprint(show, K"foo_1") == "K\"foo_1\""

# Module id out of range (larger than the maximum allowed module id)
@test_throws ErrorException JuliaSyntax.register_kinds!(BarKinds, 64, ["hoo?"])
# Too many kind names per module
@test_throws ErrorException JuliaSyntax.register_kinds!(BarKinds, 42, string.(1:1024))
# Re-registering in a different order, or registering additional kinds for an
# already-registered module, is an error
@test_throws ErrorException JuliaSyntax.register_kinds!(FooKinds, 42, ["foo_2", "foo_1"])
@test_throws ErrorException JuliaSyntax.register_kinds!(FooKinds, 42, ["foo_3"])
# Module ID already taken by FooKinds
@test_throws ErrorException JuliaSyntax.register_kinds!(BarKinds, 42, ["hii?"])
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ include("test_utils_tests.jl")
include("fuzz_test.jl")

include("utils.jl")
include("kinds.jl")

@testset "Tokenize" begin
include("tokenize.jl")
Expand Down

0 comments on commit f99b76f

Please sign in to comment.