Skip to content

Commit c961d8f

Browse files
author
vidy
committed
Fix word bounds not calculated correctly
1 parent 5e70622 commit c961d8f

File tree

3 files changed

+33
-32
lines changed

3 files changed

+33
-32
lines changed

examples/text.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@ fn main() {
1111
let flow = pdf_text::run(&file, &page, &resolver, Default::default(), false).expect("can't render page");
1212

1313
for run in flow.runs {
14-
for line in run.lines {
15-
for word in line.words {
16-
println!("{}", word.text.as_str());
14+
for line in &run.lines {
15+
println!("{:?}", line.rect);
16+
for word in &line.words {
17+
println!("{}, {:?}", word.text.as_str(), word.rect);
1718
// for char in word.chars {
1819
// println!("{:?}", char);
1920
// }

src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
2121
render_page(&mut tracer, resolve, &page, transform)?;
2222

2323
let bbox = tracer.view_box();
24-
2524
let items: Vec<DrawItem<OutlineBuilder>> = tracer.finish();
2625
//Get all patterns which may have lines and texts inside.
2726
let mut patterns = HashSet::new();

src/text.rs

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use crate::{flow::{Char, Rect, Word}, util::avg};
1010
pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<Item=&'a TextSpan<E>> + Clone) -> Vec<Word> {
1111
let word_gap = analyze_word_gap(items.clone());
1212
let mut words = Vec::new();
13-
let mut current_word = WordBuilder::new(out.len());
13+
let mut current_word = WordBuilder::new(out.len(), 0.0);
1414

1515
// Whether the last processed TextChar is a whitespace
1616
// ' ' Space
@@ -20,6 +20,8 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
2020
// '\u{00A0}' Non-breaking space
2121
let mut trailing_space = out.chars().last().map_or(true, |c| c.is_whitespace());
2222

23+
let mut end = 0.; // trailing edge of the last char
24+
2325
for span in items {
2426
let mut offset = 0;
2527
let tr_inv = span.transform.matrix.inverse();
@@ -42,27 +44,29 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
4244

4345
let is_whitespace = text.chars().all(|c| c.is_whitespace());
4446

45-
// byte offset
47+
// byte offsets
4648
let offset_increment = text.len();
4749
// Handle word boundaries
4850
if trailing_space && !is_whitespace {
4951
// Start new word after space
50-
current_word.start_new(out.len(), char_start);
52+
current_word = WordBuilder::new(out.len(),char_start);
5153
current_word.add_char(0, offset_increment, char_start, char_end);
54+
5255
out.extend(text.nfkc());
5356
} else if !trailing_space {
5457
if is_whitespace {
5558
// End word at space
56-
words.push(current_word.build(out, char_end));
57-
current_word = WordBuilder::new(out.len());
59+
words.push(current_word.build(out));
60+
61+
current_word = WordBuilder::new(out.len(),char_start);
5862
out.push(' ');
59-
} else if current.pos + x_off > current_word.end_pos + word_gap {
63+
} else if current.pos + x_off > end + word_gap {
6064
// End word at large gap
61-
words.push(current_word.build(out, char_end));
65+
words.push(current_word.build(out));
6266

63-
current_word = WordBuilder::new(out.len());
64-
current_word.start_new(out.len(), char_start);
67+
current_word = WordBuilder::new(out.len(), char_start);
6568
current_word.add_char(0, offset_increment, char_start, char_end);
69+
6670
out.extend(text.nfkc());
6771
} else {
6872
// Continue current word
@@ -71,16 +75,17 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
7175
out.extend(text.nfkc());
7276
}
7377
}
74-
7578
trailing_space = is_whitespace;
79+
80+
end = current.pos + x_off + current.width;
81+
7682
current_word.update_bounds(span.rect.min_y(), span.rect.max_y());
7783
}
7884
}
7985

8086
// Add final word if any
8187
if !current_word.is_empty() {
82-
let end_pos = current_word.end_pos;
83-
words.push(current_word.build(out, end_pos));
88+
words.push(current_word.build(out));
8489
}
8590

8691
words
@@ -102,40 +107,36 @@ struct WordBuilder {
102107
}
103108

104109
impl WordBuilder {
105-
fn new(word_start_idx: usize) -> Self {
110+
fn new(word_start_idx: usize, start_pos: f32) -> Self {
106111
Self {
107112
word_start_idx,
108-
start_pos: 0.0,
113+
start_pos,
109114
end_pos: 0.0,
110115
y_min: f32::INFINITY,
111116
y_max: -f32::INFINITY,
112117
chars: Vec::new(),
113118
byte_offset: 0,
114-
started: false,
119+
started: true,
115120
}
116121
}
117122

118-
fn start_new(&mut self, word_start_idx: usize, start_pos: f32) {
119-
self.word_start_idx = word_start_idx;
120-
self.start_pos = start_pos;
121-
self.started = true;
122-
}
123-
124123
fn add_char(&mut self, offset: usize, offset_increment: usize, start: f32, end: f32) {
125124
self.chars.push(Char {
126125
offset,
127126
pos: start,
128127
width: end - start,
129128
});
130129
self.end_pos = end;
130+
131131
self.byte_offset += offset_increment;
132132
}
133133

134134
fn update_bounds(&mut self, min_y: f32, max_y: f32) {
135-
if !self.started {
135+
if self.started {
136136
self.y_min = min_y;
137137
self.y_max = max_y;
138-
self.started = true;
138+
139+
self.started = false;
139140
} else {
140141
self.y_min = self.y_min.min(min_y);
141142
self.y_max = self.y_max.max(max_y);
@@ -146,23 +147,23 @@ impl WordBuilder {
146147
self.chars.is_empty()
147148
}
148149

149-
fn build(mut self, out: &str, end_pos: f32) -> Word {
150+
fn build(mut self, out: &str) -> Word {
150151
Word {
151152
text: out[self.word_start_idx..].into(),
152153
rect: Rect {
153154
x: self.start_pos,
154155
y: self.y_min,
155156
h: self.y_max - self.y_min,
156-
w: end_pos - self.start_pos
157+
w: self.end_pos - self.start_pos
157158
},
158159
chars: take(&mut self.chars)
159160
}
160161
}
161162
}
162163

163-
/// Calculate gaps between each char,
164+
/// Calculate the gaps between adjacent chars; the returned value is in em units.
165+
164166
/// The most important thing here is to make sure the returned gap is bigger than the gap between chars, and smaller than the gap between words.
165-
///
166167
/// for example:
167168
/// think of something like "ab____________c de"
168169
///
@@ -186,7 +187,7 @@ fn analyze_word_gap<'a, E: Encoder + 'a>(items: impl Iterator<Item=&'a TextSpan<
186187
let gaps = items.clone()
187188
.flat_map(|s| {
188189
// the transform matrix is from em space to device space
189-
// so we need to invert it
190+
// so we need to invert it to get a transform from device space to em space
190191
let tr_inv = s.transform.matrix.inverse();
191192
let pos = (tr_inv * s.transform.vector).x();
192193

0 commit comments

Comments
 (0)