Skip to content

Commit 0d219c0

Browse files
committed
Improve read perf, store coords as SVector
This substantially reduces the amount of memory used to store coordinates and to parse files. When parsing a large mmCIF file, memory usage dropped by approximately 50% and read time by ~15%. The change most likely to be disruptive is that coordinates are now stored as `SVector`s instead of `Vector`s. This means that the coordinates are now immutable, and you cannot change them in place by manual indexing (for example, `at.coords[2] = 100` no longer works; use `y!(at, 100)` instead). The `x!`, `y!`, and `z!` functions still work, as do in-place transformations, because the `coords` field itself is now mutable and can be reassigned to a new `SVector`.
1 parent 27e02c7 commit 0d219c0

File tree

6 files changed

+64
-61
lines changed

6 files changed

+64
-61
lines changed

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
1212
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1313
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
1414
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
15+
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
1516
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1617

1718
[weakdeps]
@@ -50,5 +51,6 @@ MetaGraphs = "0.7, 0.8"
5051
PrecompileTools = "1"
5152
RecipesBase = "1"
5253
STRIDE_jll = "1"
54+
StaticArraysCore = "1"
5355
Statistics = "1.9"
5456
julia = "1.9"

src/BioStructures.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ using PrecompileTools
1818
using RecipesBase
1919

2020
using LinearAlgebra
21+
using StaticArraysCore
2122
using Statistics
2223

2324
include("model.jl")

src/mmcif.jl

Lines changed: 38 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,7 @@ end
104104

105105
# Split a mmCIF line into tokens
106106
# See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax
107-
function splitline(s::AbstractString)
108-
tokens = String[]
107+
function splitline!(tokens, s::AbstractString)
109108
in_token = false
110109
# Quote character of the currently open quote, or ' ' if no quote open
111110
quote_open_char = ' '
@@ -114,7 +113,7 @@ function splitline(s::AbstractString)
114113
if c in whitespacechars
115114
if in_token && quote_open_char == ' '
116115
in_token = false
117-
push!(tokens, s[start_i:(i - 1)])
116+
push!(tokens, @view(s[start_i:(i - 1)]))
118117
end
119118
elseif c in quotechars
120119
if quote_open_char == ' '
@@ -127,7 +126,7 @@ function splitline(s::AbstractString)
127126
elseif c == quote_open_char && (i == length(s) || s[i + 1] in whitespacechars)
128127
quote_open_char = ' '
129128
in_token = false
130-
push!(tokens, s[start_i:(i - 1)])
129+
push!(tokens, @view(s[start_i:(i - 1)]))
131130
end
132131
elseif c == '#' && !in_token
133132
return tokens
@@ -137,13 +136,14 @@ function splitline(s::AbstractString)
137136
end
138137
end
139138
if in_token
140-
push!(tokens, s[start_i:end])
139+
push!(tokens, @view(s[start_i:end]))
141140
end
142141
if quote_open_char != ' '
143142
throw(ArgumentError("Line ended with quote open: $s"))
144143
end
145144
return tokens
146145
end
146+
# Non-mutating convenience form of `splitline!` (mostly for testing): the tokens
# of `s` are collected into a freshly allocated `String[]`, which is returned.
function splitline(s::AbstractString)
    return splitline!(String[], s)
end
147147

148148
# Get tokens from a mmCIF file
149149
function tokenizecif(f::IO)
@@ -162,7 +162,7 @@ function tokenizecif(f::IO)
162162
end
163163
push!(tokens, join(token_buffer, "\n"))
164164
else
165-
append!(tokens, splitline(line))
165+
splitline!(tokens, line)
166166
end
167167
end
168168
return tokens
@@ -204,7 +204,7 @@ function tokenizecifstructure(f::IO)
204204
in_keys = true
205205
else
206206
in_keys = false
207-
append!(tokens, splitline(line))
207+
splitline!(tokens, line)
208208
end
209209
end
210210
return tokens
@@ -236,7 +236,7 @@ function MMCIFDict(f::IO; gzip::Bool=false)
236236
end
237237

238238
# Add tokens to a mmCIF dictionary
239-
function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String})
239+
function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractString})
240240
key = ""
241241
keys = String[]
242242
loop_flag = false
@@ -264,16 +264,8 @@ function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String})
264264
continue
265265
end
266266
else
267-
try
268-
push!(mmcif_dict[keys[i % n + 1]], token)
269-
catch ex
270-
# A zero division error means we have not found any keys
271-
if isa(ex, DivideError)
272-
throw(ArgumentError("Loop keys not found, token: \"$token\""))
273-
else
274-
rethrow()
275-
end
276-
end
267+
iszero(n) && throw(ArgumentError("Loop keys not found, token: \"$token\""))
268+
push!(mmcif_dict[keys[i % n + 1]], token)
277269
i += 1
278270
continue
279271
end
@@ -384,25 +376,34 @@ function MolecularStructure(mmcif_dict::MMCIFDict;
384376
end
385377

386378
# Constructor from mmCIF ATOM/HETATM line
387-
AtomRecord(d::MMCIFDict, i::Integer) = AtomRecord(
388-
d["_atom_site.group_PDB"][i] == "HETATM",
389-
parse(Int, d["_atom_site.id"][i]),
390-
d["_atom_site.auth_atom_id"][i],
391-
d["_atom_site.label_alt_id"][i] in missingvals ? ' ' : d["_atom_site.label_alt_id"][i][1],
392-
d["_atom_site.auth_comp_id"][i],
393-
d["_atom_site.auth_asym_id"][i],
394-
parse(Int, d["_atom_site.auth_seq_id"][i]),
395-
d["_atom_site.pdbx_PDB_ins_code"][i] in missingvals ? ' ' : d["_atom_site.pdbx_PDB_ins_code"][i][1],
396-
[
397-
parse(Float64, d["_atom_site.Cartn_x"][i]),
398-
parse(Float64, d["_atom_site.Cartn_y"][i]),
399-
parse(Float64, d["_atom_site.Cartn_z"][i])
400-
],
401-
d["_atom_site.occupancy"][i] in missingvals ? 1.0 : parse(Float64, d["_atom_site.occupancy"][i]),
402-
d["_atom_site.B_iso_or_equiv"][i] in missingvals ? 0.0 : parse(Float64, d["_atom_site.B_iso_or_equiv"][i]),
403-
d["_atom_site.type_symbol"][i] in missingvals ? " " : d["_atom_site.type_symbol"][i],
404-
d["_atom_site.pdbx_formal_charge"][i] in missingvals ? " " : d["_atom_site.pdbx_formal_charge"][i],
405-
)
379+
function AtomRecord(d::MMCIFDict, i::Integer)
380+
alt_id = d["_atom_site.label_alt_id"][i]
381+
ins_code = d["_atom_site.pdbx_PDB_ins_code"][i]
382+
occupancy = d["_atom_site.occupancy"][i]
383+
temp_factor = d["_atom_site.B_iso_or_equiv"][i]
384+
typesym = d["_atom_site.type_symbol"][i]
385+
charge = d["_atom_site.pdbx_formal_charge"][i]
386+
387+
return AtomRecord(
388+
d["_atom_site.group_PDB"][i] == "HETATM",
389+
parse(Int, d["_atom_site.id"][i]),
390+
d["_atom_site.auth_atom_id"][i],
391+
alt_id in missingvals ? ' ' : alt_id[1],
392+
d["_atom_site.auth_comp_id"][i],
393+
d["_atom_site.auth_asym_id"][i],
394+
parse(Int, d["_atom_site.auth_seq_id"][i]),
395+
ins_code in missingvals ? ' ' : ins_code[1],
396+
SVector{3,Float64}((
397+
parse(Float64, d["_atom_site.Cartn_x"][i]),
398+
parse(Float64, d["_atom_site.Cartn_y"][i]),
399+
parse(Float64, d["_atom_site.Cartn_z"][i]),
400+
)),
401+
occupancy in missingvals ? 1.0 : parse(Float64, occupancy),
402+
temp_factor in missingvals ? 0.0 : parse(Float64, temp_factor),
403+
typesym in missingvals ? " " : typesym,
404+
charge in missingvals ? " " : charge,
405+
)
406+
end
406407

407408
# Format a mmCIF data value by enclosing with quotes or semicolon lines where
408409
# appropriate. See

src/model.jl

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -102,20 +102,20 @@ end
102102
Base.showerror(io::IO, e::PDBConsistencyError) = print(io, "PDBConsistencyError: ", e.message)
103103

104104
"An atom that is part of a macromolecule."
105-
struct Atom <: AbstractAtom
106-
serial::Int
107-
name::String
108-
alt_loc_id::Char
109-
coords::Vector{Float64}
110-
occupancy::Float64
111-
temp_factor::Float64
112-
element::String
113-
charge::String
114-
residue::StructuralElement
105+
mutable struct Atom <: AbstractAtom
106+
const serial::Int
107+
const name::String
108+
const alt_loc_id::Char
109+
coords::SVector{3,Float64}
110+
const occupancy::Float64
111+
const temp_factor::Float64
112+
const element::String
113+
const charge::String
114+
const residue::StructuralElement
115115
end
116116

117117
function Atom(a::Atom, r::StructuralElement)
118-
return Atom(a.serial, a.name, a.alt_loc_id, copy(a.coords), a.occupancy,
118+
return Atom(a.serial, a.name, a.alt_loc_id, a.coords, a.occupancy,
119119
a.temp_factor, a.element, a.charge, r)
120120
end
121121

@@ -234,7 +234,7 @@ struct AtomRecord
234234
chain_id::String
235235
res_number::Int
236236
ins_code::Char
237-
coords::Vector{Float64}
237+
coords::SVector{3,Float64}
238238
occupancy::Float64
239239
temp_factor::Float64
240240
element::String
@@ -483,7 +483,7 @@ Set the x coordinate in Å of an `AbstractAtom` to `val`.
483483
484484
For `DisorderedAtom`s only the default atom is updated.
485485
"""
486-
x!(at::Atom, x::Real) = (at.coords[1] = x; at)
486+
function x!(at::Atom, x::Real)
    # `coords` is an immutable `SVector{3,Float64}`, so a single component cannot
    # be assigned in place; build a new vector and reassign the field instead.
    at.coords = SVector{3,Float64}((x, at.coords[2], at.coords[3]))
    # Return the atom itself (not the new coordinates), matching the previous
    # `Vector`-based method and `coords!`.
    return at
end
487487
x!(dis_at::DisorderedAtom, x::Real) = x!(defaultatom(dis_at), x)
488488

489489
"""
@@ -501,7 +501,7 @@ Set the y coordinate in Å of an `AbstractAtom` to `val`.
501501
502502
For `DisorderedAtom`s only the default atom is updated.
503503
"""
504-
y!(at::Atom, y::Real) = (at.coords[2] = y; at)
504+
function y!(at::Atom, y::Real)
    # `coords` is an immutable `SVector{3,Float64}`, so a single component cannot
    # be assigned in place; build a new vector and reassign the field instead.
    at.coords = SVector{3,Float64}((at.coords[1], y, at.coords[3]))
    # Return the atom itself (not the new coordinates), matching the previous
    # `Vector`-based method and `coords!`.
    return at
end
505505
y!(dis_at::DisorderedAtom, y::Real) = y!(defaultatom(dis_at), y)
506506

507507
"""
@@ -519,31 +519,30 @@ Set the z coordinate in Å of an `AbstractAtom` to `val`.
519519
520520
For `DisorderedAtom`s only the default atom is updated.
521521
"""
522-
z!(at::Atom, z::Real) = (at.coords[3] = z; at)
522+
function z!(at::Atom, z::Real)
    # `coords` is an immutable `SVector{3,Float64}`, so a single component cannot
    # be assigned in place; build a new vector and reassign the field instead.
    at.coords = SVector{3,Float64}((at.coords[1], at.coords[2], z))
    # Return the atom itself (not the new coordinates), matching the previous
    # `Vector`-based method and `coords!`.
    return at
end
523523
z!(dis_at::DisorderedAtom, z::Real) = z!(defaultatom(dis_at), z)
524524

525525
"""
526526
coords(at)
527527
528-
Get the coordinates in Å of an `AbstractAtom` as a `Vector{Float64}`.
528+
Get the coordinates in Å of an `AbstractAtom` as a `SVector{3,Float64}`.
529529
"""
530530
coords(at::Atom) = at.coords
531531
coords(dis_at::DisorderedAtom) = coords(defaultatom(dis_at))
532532

533533
"""
534534
coords!(at, new_coords)
535535
536-
Set the coordinates in Å of an `AbstractAtom` to a `Vector` of 3 numbers.
536+
Set the coordinates in Å of an `AbstractAtom` to `new_coords`, an iterable of 3 numbers.
537537
538538
For `DisorderedAtom`s only the default atom is updated.
539539
"""
540540
function coords!(at::Atom, new_coords)
541541
if length(new_coords) != 3
542542
throw(ArgumentError("3 coordinates must be given"))
543543
end
544-
x!(at, new_coords[1])
545-
y!(at, new_coords[2])
546-
z!(at, new_coords[3])
544+
x, y, z = new_coords
545+
at.coords = SVector{3,Float64}((x, y, z))
547546
return at
548547
end
549548

@@ -784,7 +783,7 @@ function resid(hetatm::Bool, resnum::Int, inscode::Char)
784783
end
785784
else
786785
if inscode == ' '
787-
return "$resnum"
786+
return string(resnum)
788787
else
789788
return "$resnum$inscode"
790789
end

src/pdb.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,11 @@ function AtomRecord(pdb_line::String, line_n::Integer=1)
127127
parsechainid(pdb_line, line_n),
128128
parseresnumber(pdb_line, line_n),
129129
parseinscode(pdb_line, line_n),
130-
[
130+
SVector{3,Float64}((
131131
parsecoordx(pdb_line, line_n),
132132
parsecoordy(pdb_line, line_n),
133133
parsecoordz(pdb_line, line_n)
134-
],
134+
)),
135135
n >= 60 ? parseoccupancy(pdb_line) : 1.0,
136136
n >= 66 ? parsetempfac(pdb_line) : 0.0,
137137
n >= 78 ? parseelement(pdb_line) : " ",

test/runtests.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ end
246246
testparent(getchildren(struc), struc)
247247
struc_copy = copy(struc)
248248
testparent(getchildren(struc_copy), struc_copy)
249-
struc_copy['A'][10]["CA"].coords[2] = 100
249+
y!(struc_copy['A'][10]["CA"], 100)
250250
@test struc_copy['A'][10]["CA"].coords[2] == 100
251251
@test a.coords[2] == 2
252252
@test struc['A'][10]["CA"].coords[2] == 2

0 commit comments

Comments
 (0)