Skip to content

Commit d680296

Browse files
author
vidy
committed
Refactor concat_text to make it easier to understand
1 parent 9db0789 commit d680296

File tree

2 files changed

+100
-66
lines changed

2 files changed

+100
-66
lines changed

examples/text.rs

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,8 @@ fn main() {
1313
println!("# page {}", 0 + 1);
1414
for run in flow.runs {
1515
for line in run.lines {
16-
for w in line.words {
17-
println!("{}", w.text);
18-
}
16+
println!("{}", line.words.iter().map(|w| w.text.as_str()).format(" "));
1917
}
2018
}
21-
// for line in flow.lines {
22-
// for w in line.words {
23-
// println!("{}", w.text);
24-
// }
25-
// }
2619
// }
2720
}

src/text.rs

Lines changed: 99 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -6,76 +6,74 @@ use unicode_normalization::UnicodeNormalization;
66
use crate::{util::avg, flow::{Word, Rect}};
77

88
pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<Item=&'a TextSpan<E>> + Clone) -> Vec<Word> {
9+
let word_gap = analyze_word_gap(items.clone());
910
let mut words: Vec<Word> = vec![];
10-
11-
// Calculate gaps between each char, the unit is em, relative to the font size.
12-
let gaps = items.clone()
13-
.flat_map(|s| {
14-
// the transform matrix is from em space to device space
15-
// so we need to invert it
16-
let tr_inv = s.transform.matrix.inverse();
17-
let pos = (tr_inv * s.transform.vector).x();
1811

19-
s.chars.iter()
20-
.filter(|c| !s.text[c.offset..].chars().next().unwrap().is_whitespace())
21-
.map(move |c| (c.pos + pos, c.pos + pos + c.width, s.font_size))
22-
})
23-
.tuple_windows()
24-
.filter(|(a, b)| b.0 > a.0)
25-
.map(|(a, b)| (b.0 - a.1).max(0.01).min(0.25 * (a.2 + b.2)));
26-
27-
let font_size = avg(items.clone().map(|s| s.font_size)).unwrap();
28-
//gaps.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
29-
let space_gap = (0.5 * font_size).min(2.0 * avg(gaps).unwrap_or(0.0)); //2.0 * gaps[gaps.len()/2];
30-
3112
let mut end = 0.; // trailing edge of the last char
32-
// out中最后一个字符是否是空格
13+
14+
// Whether the last processed TextChar is a space
3315
let mut trailing_space = out.chars().last().map(|c| c.is_whitespace()).unwrap_or(true);
16+
3417
let mut word_start_pos = 0.0;
18+
let mut word_end_pos = 0.0;
19+
3520
let mut word_start_idx = out.len();
3621
let mut y_min = f32::INFINITY;
3722
let mut y_max = -f32::INFINITY;
3823
let mut word_start = true;
39-
let mut word_end = 0.0;
4024

4125
for span in items {
42-
let mut pos = 0; // byte index of last char into span.text
26+
let mut offset = 0; // byte index of last char into span.text
4327
let tr_inv = span.transform.matrix.inverse();
4428
let x_off = (tr_inv * span.transform.vector).x();
4529

46-
for c in span.chars.iter() {
47-
// current string of TextChar
48-
let s = &span.text[pos..c.offset];
49-
if c.offset > 0 {
50-
let is_whitespace = s.chars().all(|c| c.is_whitespace());
51-
// 在不为空格的时候, 将 s 写入 out.
52-
if !trailing_space || !is_whitespace {
53-
out.extend(s.nfkc());
30+
let chars = span.chars.as_slice();
31+
for (i, c) in chars.iter().enumerate() {
32+
let next_offset = chars.get(i + 1).map_or(span.text.len(), |next| next.offset);
33+
let s: &str = &span.text[offset..next_offset];
34+
35+
out.extend(s.nfkc());
36+
37+
let is_whitespace = s.chars().all(|c| c.is_whitespace());
38+
39+
if trailing_space {
40+
if !is_whitespace {
41+
word_start = true;
42+
word_start_idx = out.len() - 1;
5443
}
5544
trailing_space = is_whitespace;
45+
} else {
46+
trailing_space = is_whitespace;
47+
if is_whitespace {
48+
words.push(Word {
49+
text: out[word_start_idx..out.len()-1].into(),
50+
rect: Rect {
51+
x: word_start_pos,
52+
y: y_min,
53+
h: y_max - y_min,
54+
w: word_end_pos - word_start_pos
55+
}
56+
});
57+
} else if c.pos + x_off > end + word_gap {
58+
words.push(Word {
59+
text: out[word_start_idx..].into(),
60+
rect: Rect {
61+
x: word_start_pos,
62+
y: y_min,
63+
h: y_max - y_min,
64+
w: word_end_pos - word_start_pos
65+
}
66+
});
67+
68+
out.push(' ');
69+
trailing_space = true;
70+
word_start = true;
71+
word_start_idx = out.len() - 1;
72+
}
5673
}
57-
// 在 s 不为空格,且有gap 的时候,记录一个 word.
58-
if !trailing_space && c.pos + x_off > end + space_gap {
59-
words.push(Word {
60-
text: out[word_start_idx..].into(),
61-
rect: Rect {
62-
x: word_start_pos,
63-
y: y_min,
64-
h: y_max - y_min,
65-
w: word_end - word_start_pos
66-
}
67-
});
68-
69-
out.push(' ');
70-
trailing_space = true;
71-
word_start = true;
72-
word_start_idx = out.len();
73-
}
74-
pos = c.offset;
74+
7575
end = c.pos + x_off + c.width;
76-
if c.offset == 0 || !trailing_space {
77-
word_end = (span.transform.matrix * Vector2F::new(end, 0.0)).x();
78-
}
76+
word_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();
7977

8078
if word_start {
8179
y_min = span.rect.min_y();
@@ -86,25 +84,68 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
8684
y_min = y_min.min(span.rect.min_y());
8785
y_max = y_max.max(span.rect.max_y());
8886
}
89-
}
90-
91-
trailing_space = span.text[pos..].chars().all(|c| c.is_whitespace());
9287

93-
out.extend(span.text[pos..].nfkc());
88+
offset = next_offset;
89+
}
9490
}
91+
9592
words.push(Word {
9693
text: out[word_start_idx..].into(),
9794
rect: Rect {
9895
x: word_start_pos,
9996
y: y_min,
10097
h: y_max - y_min,
101-
w: word_end - word_start_pos
98+
w: word_end_pos - word_start_pos
10299
}
103100
});
104-
101+
105102
words
106103
}
107104

105+
/// Calculate gaps between each char.
106+
/// The most important thing here is to make sure the gap is bigger than char gap, and less than word gap.
107+
///
108+
/// for example:
109+
/// think of something like "ab ____________c de"
110+
///
111+
/// a-b has a zero space (or 0.01)
112+
/// b-c has a huge space of 10
113+
/// c-d has 0.2
114+
/// d-e has 0.01
115+
/// if we just take the sum (about 10.2) and divide that by 4 we get an average of about 2.5
116+
/// and now c-d is smaller than that and not classified as a space
117+
/// but if b-c is capped by the threshold of 0.5, the sum is 0.7, and the avg is 0.7/4 ~ 0.18
118+
/// and everything is fine.
119+
120+
/// 0 + min(0.5, 10) + 0.2 + 0
121+
/// 10 capped at 0.5 is 0.5
122+
/// min(0, 0.5) + min(10, 0.5) + min(0.2, 0.5) + min(0, 0.5)
123+
/// 0 + 0.5 + 0.2 + 0
124+
/// every value is limited to be at least 0.01 and not more than 0.5.
125+
/// the 0.5 is 0.25 * font size of the left char and 0.25 * font size of the right char
126+
/// if they are the same font size it is 0.5
127+
fn analyze_word_gap<'a, E: Encoder + 'a>(items: impl Iterator<Item=&'a TextSpan<E>> + Clone) -> f32 {
128+
let gaps = items.clone()
129+
.flat_map(|s| {
130+
// the transform matrix is from em space to device space
131+
// so we need to invert it
132+
let tr_inv = s.transform.matrix.inverse();
133+
let pos = (tr_inv * s.transform.vector).x();
134+
135+
s.chars.iter()
136+
.filter(|c| !s.text[c.offset..].chars().next().unwrap().is_whitespace())
137+
.map(move |c| (c.pos + pos, c.pos + pos + c.width, s.font_size))
138+
})
139+
.tuple_windows()
140+
.filter(|(a, b)| b.0 > a.0)
141+
.map(|(a, b)| (b.0 - a.1).max(0.01).min(0.25 * (a.2 + b.2)));
142+
143+
let avg_font_size = avg(items.clone().map(|s| s.font_size)).unwrap();
144+
//gaps.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
145+
146+
(0.5 * avg_font_size).min(2.0 * avg(gaps).unwrap_or(0.0)) //2.0 * gaps[gaps.len()/2];
147+
}
148+
108149
#[cfg(test)]
109150
mod tests {
110151
use pathfinder_geometry::{rect::RectF, transform2d::Transform2F};

0 commit comments

Comments
 (0)