Skip to content

Refactor to use syllable (WIP) #14

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 46 additions & 40 deletions bogo/accent.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# This file is part of ibus-bogo project.
#
# Copyright (C) 2012 Long T. Dam <[email protected]>
# Copyright (C) 2012-2013 Trung Ngo <[email protected]>
# Copyright (C) 2012-2014 Trung Ngo <[email protected]>
# Copyright (C) 2013 Duong H. Nguyen <[email protected]>
#
# ibus-bogo is free software: you can redistribute it and/or modify
Expand All @@ -21,7 +21,7 @@
#

"""
Utility functions to deal with accents (should have been called tones),
Utility functions to deal with accents (also called tones),
which are diacritical markings that change the pitch of a character.
E.g. the acute accent in á.
"""
Expand All @@ -31,9 +31,11 @@

from __future__ import unicode_literals
from bogo import utils
from bogo.syllable import Syllable


class Accent:
MAX_VALUE = 6
GRAVE = 5
ACUTE = 4
HOOK = 3
Expand Down Expand Up @@ -62,61 +64,65 @@ def get_accent_string(string):
return accents[-1] if accents else Accent.NONE


def add_accent(components, accent):
def add_accent(syllable, accent):
"""
Add accent to the given components. The parameter components is
the result of function separate()
Add accent to the given syllable.
"""
vowel = components[1]
last_consonant = components[2]
vowel = syllable.vowel

if not vowel:
return syllable

if accent == Accent.NONE:
vowel = remove_accent_string(vowel)
return [components[0], vowel, last_consonant]
return Syllable(syllable.initial_consonant, vowel, syllable.final_consonant)

if vowel == "":
return components
#raw_string is a list, not a str object
raw_string = remove_accent_string(vowel).lower()
new_vowel = ""
vowel_wo_accent = remove_accent_string(vowel).lower()
new_vowel = ''

# Highest priority for ê and ơ
index = max(raw_string.find("ê"), raw_string.find("ơ"))
if index != -1:
new_vowel = vowel[:index] + add_accent_char(vowel[index], accent) + vowel[index+1:]
elif len(vowel) == 1 or (len(vowel) == 2 and last_consonant == ""):
new_vowel = add_accent_char(vowel[0], accent) + vowel[1:]
index = max(vowel_wo_accent.find("ê"), vowel_wo_accent.find("ơ"))
found_e_hat_or_o_horn = index != -1

if found_e_hat_or_o_horn:
# Add accent mark to the found ê or ơ
new_vowel = \
vowel[:index] + \
add_accent_char(vowel[index], accent) + \
vowel[index + 1:]
elif len(vowel) == 1 or (len(vowel) == 2 and not syllable.final_consonant):
# cá
# cháo
first_vowel_char = vowel[0]
first_vowel_char_with_accent = add_accent_char(first_vowel_char, accent)
new_vowel = first_vowel_char_with_accent + vowel[1:]
else:
new_vowel = vowel[:1] + add_accent_char(vowel[1], accent) + vowel[2:]
return [components[0], new_vowel, components[2]]
# biến
# khuỷu
second_vowel_char = vowel[1]
second_vowel_char_with_accent = add_accent_char(second_vowel_char, accent)
new_vowel = vowel[:1] + second_vowel_char_with_accent + vowel[2:]

return Syllable(syllable.initial_consonant, new_vowel, syllable.final_consonant)


@utils.keep_case
def add_accent_char(char, accent):
"""
Add accent to a single char. Parameter accent is member of class
Accent
Add accent to a single char.

Args:
accent: an Accent enum value
"""
if char == "":
return ""
case = char.isupper()
char = char.lower()
if not (char and accent in range(0, Accent.MAX_VALUE + 1)):
return char

index = utils.VOWELS.find(char)
if (index != -1):
index = index - index % 6 + 5
char = utils.VOWELS[index - accent]
return utils.change_case(char, case)


def add_accent_at(string, accent, index):
"""
Add mark to the index-th character of the given string. Return
the new string after applying change.
(unused)
"""
if index == -1:
return string
# Python can handle the case which index is out of range of given string
return string[:index] + \
accent.accent.add_accent_char(string[index], accent) + \
string[index+1:]
return char


def remove_accent_char(char):
Expand Down
82 changes: 82 additions & 0 deletions bogo/syllable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import collections
from bogo import utils


class Syllable(collections.namedtuple('Syllable',
        ['initial_consonant', 'vowel', 'final_consonant'])):
    """
    An immutable (initial_consonant, vowel, final_consonant) triple of
    strings describing one syllable.

    Being a namedtuple, an instance compares equal to a plain 3-tuple
    holding the same parts, and every "modifying" method returns a new
    Syllable instead of changing the receiver.
    """

    @staticmethod
    def new_from_string(string):
        """
        Make a Syllable from a string.

        The string is split from the right: the trailing run of
        non-vowels becomes the final consonant, the run of vowels before
        it becomes the vowel, and whatever remains in front becomes the
        initial consonant. Vowel detection is delegated to
        utils.is_vowel().

        Args:
            string: the string to be parsed

        Returns:
            a Syllable

        >>> Syllable.new_from_string('tuong') == ('t', 'uo', 'ng')
        True
        >>> Syllable.new_from_string('ohmyfkinggod') == ('ohmyfkingg', 'o', 'd')
        True
        >>> Syllable.new_from_string('qua') == ('qu', 'a', '')
        True
        """
        def split_trailing_run(text, want_vowel):
            # Return (rest, run), where `run` is the longest suffix of
            # `text` whose characters all have
            # utils.is_vowel(c) == want_vowel.
            # Done with a loop rather than one recursive call per
            # character so that long inputs cannot exceed the
            # interpreter's recursion limit.
            cut = len(text)
            while cut > 0 and utils.is_vowel(text[cut - 1]) == want_vowel:
                cut -= 1
            return text[:cut], text[cut:]

        head, last_consonant = split_trailing_run(string, False)
        first_consonant, vowel = split_trailing_run(head, True)

        # A string made only of consonants ends up entirely in the
        # trailing run; report it as the initial consonant instead.
        if last_consonant and not (vowel + first_consonant):
            first_consonant = last_consonant
            last_consonant = ''

        # 'gi' and 'qu' are considered qualified consonants.
        # We want something like this:
        #     ['g', 'ia', ''] -> ['gi', 'a', '']
        #     ['q', 'ua', ''] -> ['qu', 'a', '']
        # The len(vowel) > 1 guard keeps a lone 'gi' as ('g', 'i', '').
        if len(vowel) > 1 and \
                (first_consonant + vowel[0]).lower() in ('gi', 'qu'):
            first_consonant += vowel[0]
            vowel = vowel[1:]

        return Syllable(first_consonant, vowel, last_consonant)

    def append_char(self, char):
        """
        Return a new Syllable with `char` appended to this one.

        The part that receives the character is chosen by this rule:
        a vowel is added to the vowel part if there is no final
        consonant, else to the final consonant part; a consonant is
        added to the initial consonant part if both the vowel and the
        final consonant are empty, and to the final consonant part
        otherwise.

        Args:
            char: the character to append

        Returns:
            a new Syllable; the receiver is left unchanged

        >>> Syllable('', '', '').append_char('c') == ('c', '', '')
        True
        >>> Syllable('c', '', '').append_char('a') == ('c', 'a', '')
        True
        >>> Syllable('c', 'a', '').append_char('c') == ('c', 'a', 'c')
        True
        >>> Syllable('c', 'a', 'c').append_char('a') == ('c', 'a', 'ca')
        True
        """
        if utils.is_vowel(char):
            if not self.final_consonant:
                return self._replace(vowel=self.vowel + char)
        elif not self.final_consonant and not self.vowel:
            return self._replace(
                initial_consonant=self.initial_consonant + char)

        # Every remaining case extends the final consonant part.
        return self._replace(final_consonant=self.final_consonant + char)
83 changes: 83 additions & 0 deletions bogo/test/test_accent.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,86 @@
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from nose.tools import eq_
from bogo.accent import add_accent, add_accent_char, Accent
from bogo.syllable import Syllable


class TestAddAccentChar():
    """Checks for add_accent_char(), which accents a single character."""

    def test_empty_char(self):
        # An empty string comes back as an empty string.
        eq_(add_accent_char('', Accent.GRAVE), '')

    def test_out_of_range_accent(self):
        # A value that is not an Accent member leaves the char untouched.
        eq_(add_accent_char('a', 293432), 'a')

    def test_normal_accent(self):
        eq_(add_accent_char('a', Accent.ACUTE), 'á')

    def test_upper_case(self):
        # The case of the input character is kept.
        accented = add_accent_char('A', Accent.ACUTE)
        eq_(accented, 'Á')


class TestAddAccent():
    """Checks for add_accent(), which accents the vowel of a Syllable."""

    def test_remove_accent(self):
        # Accent.NONE strips the existing accent.
        eq_(add_accent(Syllable('c', 'á', 'c'), Accent.NONE),
            Syllable('c', 'a', 'c'))

    def test_e_hat(self):
        # The accent lands on ê wherever it sits in the vowel.
        eq_(add_accent(Syllable('ch', 'uyê', 'n'), Accent.HOOK),
            Syllable('ch', 'uyể', 'n'))

    def test_o_horn(self):
        # The accent lands on ơ wherever it sits in the vowel.
        eq_(add_accent(Syllable('ch', 'ươ', 'ng'), Accent.HOOK),
            Syllable('ch', 'ưở', 'ng'))

    def test_double_vowel_no_final_consonant(self):
        # Two vowel chars, open syllable: first char takes the accent.
        eq_(add_accent(Syllable('c', 'ua', ''), Accent.HOOK),
            Syllable('c', 'ủa', ''))

    def test_double_vowel_with_final_consonant(self):
        eq_(add_accent(Syllable('c', 'uô', 'ng'), Accent.GRAVE),
            Syllable('c', 'uồ', 'ng'))

    def test_single_vowel(self):
        # Without a final consonant.
        open_syllable = Syllable('c', 'a', '')
        eq_(add_accent(open_syllable, Accent.ACUTE), Syllable('c', 'á', ''))

        # With a final consonant.
        closed_syllable = Syllable('c', 'a', 'n')
        eq_(add_accent(closed_syllable, Accent.ACUTE), Syllable('c', 'á', 'n'))


70 changes: 70 additions & 0 deletions bogo/test/test_syllable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from nose.tools import eq_
from bogo.syllable import Syllable


class TestSyllable():
    """Checks for Syllable.new_from_string() and Syllable.append_char()."""

    def test_parse_simple_syllable(self):
        eq_(Syllable.new_from_string('tuong'), Syllable('t', 'uo', 'ng'))

    def test_parse_qua(self):
        # 'qu' is kept together as the initial consonant.
        eq_(Syllable.new_from_string('qua'), Syllable('qu', 'a', ''))

    def test_parse_gia(self):
        # 'gi' is kept together as the initial consonant.
        eq_(Syllable.new_from_string('gia'), Syllable('gi', 'a', ''))

    def test_parse_gi(self):
        # A lone 'gi' keeps its 'i' as the vowel.
        eq_(Syllable.new_from_string('gi'), Syllable('g', 'i', ''))

    def test_parse_rubbish(self):
        eq_(Syllable.new_from_string('ohmyfkinggod'),
            Syllable('ohmyfkingg', 'o', 'd'))

    def test_append_initial_consonant(self):
        appended = Syllable('c', '', '').append_char('c')
        eq_(appended, Syllable('cc', '', ''))

    def test_append_initial_consonant_empty(self):
        appended = Syllable('', '', '').append_char('c')
        eq_(appended, Syllable('c', '', ''))

    def test_append_vowel(self):
        appended = Syllable('c', 'a', '').append_char('a')
        eq_(appended, Syllable('c', 'aa', ''))

    def test_append_vowel_empty(self):
        appended = Syllable('', '', '').append_char('a')
        eq_(appended, Syllable('', 'a', ''))

    def test_append_final_consonant(self):
        appended = Syllable('c', 'a', 'c').append_char('c')
        eq_(appended, Syllable('c', 'a', 'cc'))
Loading