Improve read perf, store coords as SVector

timholy · timholy · commit 0d219c09ff57 · 2025-05-08T05:52:52.000-05:00
This substantially reduces the amount of memory used to store
coordinates and parse files. While parsing a large mmCIF file, the
memory usage dropped by approximately 50%, and read time by ~15%.

The most potentially-disruptive change is that the coordinates are now
stored as SVectors instead of Vectors. This means that the coordinates
are now immutable, and you cannot change them in place by manual
indexing. The `x!`, `y!`, and `z!` functions still work, as do in-place
transformations, because the `coords` field itself is now mutable.
diff --git a/Project.toml b/Project.toml
@@ -12,6 +12,7 @@ Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
+StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [weakdeps]
@@ -50,5 +51,6 @@ MetaGraphs = "0.7, 0.8"
 PrecompileTools = "1"
 RecipesBase = "1"
 STRIDE_jll = "1"
+StaticArraysCore = "1"
 Statistics = "1.9"
 julia = "1.9"
diff --git a/src/BioStructures.jl b/src/BioStructures.jl
@@ -18,6 +18,7 @@ using PrecompileTools
 using RecipesBase
 
 using LinearAlgebra
+using StaticArraysCore
 using Statistics
 
 include("model.jl")
diff --git a/src/mmcif.jl b/src/mmcif.jl
@@ -104,8 +104,7 @@ end
 
 # Split a mmCIF line into tokens
 # See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax
-function splitline(s::AbstractString)
-    tokens = String[]
+function splitline!(tokens, s::AbstractString)
     in_token = false
     # Quote character of the currently open quote, or ' ' if no quote open
     quote_open_char = ' '
@@ -114,7 +113,7 @@ function splitline(s::AbstractString)
         if c in whitespacechars
             if in_token && quote_open_char == ' '
                 in_token = false
-                push!(tokens, s[start_i:(i - 1)])
+                push!(tokens, @view(s[start_i:(i - 1)]))
             end
         elseif c in quotechars
             if quote_open_char == ' '
@@ -127,7 +126,7 @@ function splitline(s::AbstractString)
             elseif c == quote_open_char && (i == length(s) || s[i + 1] in whitespacechars)
                 quote_open_char = ' '
                 in_token = false
-                push!(tokens, s[start_i:(i - 1)])
+                push!(tokens, @view(s[start_i:(i - 1)]))
             end
         elseif c == '#' && !in_token
             return tokens
@@ -137,13 +136,14 @@ function splitline(s::AbstractString)
         end
     end
     if in_token
-        push!(tokens, s[start_i:end])
+        push!(tokens, @view(s[start_i:end]))
     end
     if quote_open_char != ' '
         throw(ArgumentError("Line ended with quote open: $s"))
     end
     return tokens
 end
+splitline(s::AbstractString) = splitline!(String[], s)   # mostly for testing
 
 # Get tokens from a mmCIF file
 function tokenizecif(f::IO)
@@ -162,7 +162,7 @@ function tokenizecif(f::IO)
             end
             push!(tokens, join(token_buffer, "\n"))
         else
-            append!(tokens, splitline(line))
+            splitline!(tokens, line)
         end
     end
     return tokens
@@ -204,7 +204,7 @@ function tokenizecifstructure(f::IO)
             in_keys = true
         else
             in_keys = false
-            append!(tokens, splitline(line))
+            splitline!(tokens, line)
         end
     end
     return tokens
@@ -236,7 +236,7 @@ function MMCIFDict(f::IO; gzip::Bool=false)
 end
 
 # Add tokens to a mmCIF dictionary
-function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String})
+function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractString})
     key = ""
     keys = String[]
     loop_flag = false
@@ -264,16 +264,8 @@ function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String})
                     continue
                 end
             else
-                try
-                    push!(mmcif_dict[keys[i % n + 1]], token)
-                catch ex
-                    # A zero division error means we have not found any keys
-                    if isa(ex, DivideError)
-                        throw(ArgumentError("Loop keys not found, token: \"$token\""))
-                    else
-                        rethrow()
-                    end
-                end
+                iszero(n) && throw(ArgumentError("Loop keys not found, token: \"$token\""))
+                push!(mmcif_dict[keys[i % n + 1]], token)
                 i += 1
                 continue
             end
@@ -384,25 +376,34 @@ function MolecularStructure(mmcif_dict::MMCIFDict;
 end
 
 # Constructor from mmCIF ATOM/HETATM line
-AtomRecord(d::MMCIFDict, i::Integer) = AtomRecord(
-    d["_atom_site.group_PDB"][i] == "HETATM",
-    parse(Int, d["_atom_site.id"][i]),
-    d["_atom_site.auth_atom_id"][i],
-    d["_atom_site.label_alt_id"][i] in missingvals ? ' ' : d["_atom_site.label_alt_id"][i][1],
-    d["_atom_site.auth_comp_id"][i],
-    d["_atom_site.auth_asym_id"][i],
-    parse(Int, d["_atom_site.auth_seq_id"][i]),
-    d["_atom_site.pdbx_PDB_ins_code"][i] in missingvals ? ' ' : d["_atom_site.pdbx_PDB_ins_code"][i][1],
-    [
-        parse(Float64, d["_atom_site.Cartn_x"][i]),
-        parse(Float64, d["_atom_site.Cartn_y"][i]),
-        parse(Float64, d["_atom_site.Cartn_z"][i])
-    ],
-    d["_atom_site.occupancy"][i] in missingvals ? 1.0 : parse(Float64, d["_atom_site.occupancy"][i]),
-    d["_atom_site.B_iso_or_equiv"][i] in missingvals ? 0.0 : parse(Float64, d["_atom_site.B_iso_or_equiv"][i]),
-    d["_atom_site.type_symbol"][i] in missingvals ? "  " : d["_atom_site.type_symbol"][i],
-    d["_atom_site.pdbx_formal_charge"][i] in missingvals ? "  " : d["_atom_site.pdbx_formal_charge"][i],
-)
+function AtomRecord(d::MMCIFDict, i::Integer)
+    alt_id = d["_atom_site.label_alt_id"][i]
+    ins_code = d["_atom_site.pdbx_PDB_ins_code"][i]
+    occupancy = d["_atom_site.occupancy"][i]
+    temp_factor = d["_atom_site.B_iso_or_equiv"][i]
+    typesym = d["_atom_site.type_symbol"][i]
+    charge = d["_atom_site.pdbx_formal_charge"][i]
+
+    return AtomRecord(
+        d["_atom_site.group_PDB"][i] == "HETATM",
+        parse(Int, d["_atom_site.id"][i]),
+        d["_atom_site.auth_atom_id"][i],
+        alt_id in missingvals ? ' ' : alt_id[1],
+        d["_atom_site.auth_comp_id"][i],
+        d["_atom_site.auth_asym_id"][i],
+        parse(Int, d["_atom_site.auth_seq_id"][i]),
+        ins_code in missingvals ? ' ' : ins_code[1],
+        SVector{3,Float64}((
+            parse(Float64, d["_atom_site.Cartn_x"][i]),
+            parse(Float64, d["_atom_site.Cartn_y"][i]),
+            parse(Float64, d["_atom_site.Cartn_z"][i]),
+        )),
+        occupancy in missingvals ? 1.0 : parse(Float64, occupancy),
+        temp_factor in missingvals ? 0.0 : parse(Float64, temp_factor),
+        typesym in missingvals ? "  " : typesym,
+        charge in missingvals ? "  " : charge,
+    )
+end
 
 # Format a mmCIF data value by enclosing with quotes or semicolon lines where
 #   appropriate. See
diff --git a/src/model.jl b/src/model.jl
@@ -102,20 +102,20 @@ end
 Base.showerror(io::IO, e::PDBConsistencyError) = print(io, "PDBConsistencyError: ", e.message)
 
 "An atom that is part of a macromolecule."
-struct Atom <: AbstractAtom
-    serial::Int
-    name::String
-    alt_loc_id::Char
-    coords::Vector{Float64}
-    occupancy::Float64
-    temp_factor::Float64
-    element::String
-    charge::String
-    residue::StructuralElement
+mutable struct Atom <: AbstractAtom
+    const serial::Int
+    const name::String
+    const alt_loc_id::Char
+    coords::SVector{3,Float64}
+    const occupancy::Float64
+    const temp_factor::Float64
+    const element::String
+    const charge::String
+    const residue::StructuralElement
 end
 
 function Atom(a::Atom, r::StructuralElement)
-    return Atom(a.serial, a.name, a.alt_loc_id, copy(a.coords), a.occupancy,
+    return Atom(a.serial, a.name, a.alt_loc_id, a.coords, a.occupancy,
                 a.temp_factor, a.element, a.charge, r)
 end
 
@@ -234,7 +234,7 @@ struct AtomRecord
     chain_id::String
     res_number::Int
     ins_code::Char
-    coords::Vector{Float64}
+    coords::SVector{3,Float64}
     occupancy::Float64
     temp_factor::Float64
     element::String
@@ -483,7 +483,7 @@ Set the x coordinate in Å of an `AbstractAtom` to `val`.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
-x!(at::Atom, x::Real) = (at.coords[1] = x; at)
+x!(at::Atom, x::Real) = (at.coords = SVector{3,Float64}((x, at.coords[2], at.coords[3])); at)
 x!(dis_at::DisorderedAtom, x::Real) = x!(defaultatom(dis_at), x)
 
 """
@@ -501,7 +501,7 @@ Set the y coordinate in Å of an `AbstractAtom` to `val`.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
-y!(at::Atom, y::Real) = (at.coords[2] = y; at)
+y!(at::Atom, y::Real) = (at.coords = SVector{3,Float64}((at.coords[1], y, at.coords[3])); at)
 y!(dis_at::DisorderedAtom, y::Real) = y!(defaultatom(dis_at), y)
 
 """
@@ -519,31 +519,30 @@ Set the z coordinate in Å of an `AbstractAtom` to `val`.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
-z!(at::Atom, z::Real) = (at.coords[3] = z; at)
+z!(at::Atom, z::Real) = (at.coords = SVector{3,Float64}((at.coords[1], at.coords[2], z)); at)
 z!(dis_at::DisorderedAtom, z::Real) = z!(defaultatom(dis_at), z)
 
 """
     coords(at)
 
-Get the coordinates in Å of an `AbstractAtom` as a `Vector{Float64}`.
+Get the coordinates in Å of an `AbstractAtom` as a `SVector{3,Float64}`.
 """
 coords(at::Atom) = at.coords
 coords(dis_at::DisorderedAtom) = coords(defaultatom(dis_at))
 
 """
     coords!(at, new_coords)
 
-Set the coordinates in Å of an `AbstractAtom` to a `Vector` of 3 numbers.
+Set the coordinates in Å of an `AbstractAtom` to `new_coords`, an iterable of 3 numbers.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
 function coords!(at::Atom, new_coords)
     if length(new_coords) != 3
         throw(ArgumentError("3 coordinates must be given"))
     end
-    x!(at, new_coords[1])
-    y!(at, new_coords[2])
-    z!(at, new_coords[3])
+    x, y, z = new_coords
+    at.coords = SVector{3,Float64}((x, y, z))
     return at
 end
 
@@ -784,7 +783,7 @@ function resid(hetatm::Bool, resnum::Int, inscode::Char)
         end
     else
         if inscode == ' '
-            return "$resnum"
+            return string(resnum)
         else
             return "$resnum$inscode"
         end
diff --git a/src/pdb.jl b/src/pdb.jl
@@ -127,11 +127,11 @@ function AtomRecord(pdb_line::String, line_n::Integer=1)
         parsechainid(pdb_line, line_n),
         parseresnumber(pdb_line, line_n),
         parseinscode(pdb_line, line_n),
-        [
+        SVector{3,Float64}((
             parsecoordx(pdb_line, line_n),
             parsecoordy(pdb_line, line_n),
             parsecoordz(pdb_line, line_n)
-        ],
+        )),
         n >= 60 ? parseoccupancy(pdb_line) : 1.0,
         n >= 66 ? parsetempfac(pdb_line) : 0.0,
         n >= 78 ? parseelement(pdb_line) : "  ",
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -246,7 +246,7 @@ end
     testparent(getchildren(struc), struc)
     struc_copy = copy(struc)
     testparent(getchildren(struc_copy), struc_copy)
-    struc_copy['A'][10]["CA"].coords[2] = 100
+    y!(struc_copy['A'][10]["CA"], 100)
     @test struc_copy['A'][10]["CA"].coords[2] == 100
     @test a.coords[2] == 2
     @test struc['A'][10]["CA"].coords[2] == 2