Skip to content

Commit 3f11426

Browse files
committed
Merge branch 'olzama-dev' into main
2 parents b72cd0c + 3d5dfb1 commit 3f11426

13 files changed

+5990
-132
lines changed

freeling/freeling_API/__init__.py

Whitespace-only changes.

freeling/freeling_API/_pyfreeling.so

7.34 MB
Binary file not shown.

freeling/freeling_API/pyfreeling.py

Lines changed: 5706 additions & 0 deletions
Large diffs are not rendered by default.

freeling/freeling_API/sample.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#! /usr/bin/python3

import pyfreeling
import sys, os

## ----------------------------------------------
## ------------- MAIN PROGRAM ---------------
## ----------------------------------------------

# FreeLing expects its data files under $FREELINGDIR/share/freeling.
os.environ["FREELINGDIR"] = '/usr'

if not os.path.exists(os.environ["FREELINGDIR"]+"/share/freeling"):
    print("Folder",os.environ["FREELINGDIR"]+"/share/freeling",
          "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
          file=sys.stderr)
    sys.exit(1)

# Location of FreeLing configuration files.
DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

# Initialise locales so FreeLing handles non-ASCII text correctly.
pyfreeling.util_init_locale("default")

# Language detector — created here only for demonstration; the rest of
# the script simply assumes the language is LANG.
la = pyfreeling.lang_ident(DATA + "common/lang_ident/ident-few.dat")

# Options for the maco morphological analyzer. Default values are fine
# except for the data-file locations.
LANG = "es"
op = pyfreeling.maco_options(LANG)
op.set_data_files("",
                  DATA + "common/punct.dat",
                  DATA + LANG + "/dicc.src",
                  DATA + LANG + "/afixos.dat",
                  "",
                  DATA + LANG + "/locucions.dat",
                  DATA + LANG + "/np.dat",
                  DATA + LANG + "/quantities.dat",
                  DATA + LANG + "/probabilitats.dat")

# Build the analysis pipeline: tokenizer, sentence splitter, morphology.
tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
mf = pyfreeling.maco(op)

# Activate the morpho submodules to be used in the next call.
mf.set_active_options(False, True, True, True,  # select which among created
                      True, True, False, True,  # submodules are to be used.
                      True, True, True, True)   # default: all created submodules are used

# Create the tagger; sense annotator and parsers are left disabled here.
tg = pyfreeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2)
#sen=pyfreeling.senses(DATA+LANG+"/senses.dat");
#parser= pyfreeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat");
#dep=pyfreeling.dep_txala(DATA+LANG+"/dep_txala/dependences.dat", parser.get_start_symbol());

sid = sp.open_session()

# Process a fixed sample sentence instead of reading from stdin.
#lin=sys.stdin.readline();
lin = "El perro del pueblo duerme."

tokens = tk.tokenize(lin)
sentences = sp.split(sid, tokens, False)
sentences = mf.analyze(sentences)
sentences = tg.analyze(sentences)

# Report every word of every sentence: form, lemma, character span, and
# each surviving morphological analysis with its probability.
for s in sentences:
    print(s)
    for w in s.get_words():
        print("FORM: {} LEMMA: {} START: {} END: {}".format(w.get_form(), w.get_lemma(),
                                                            w.get_span_start(), w.get_span_finish()))
        for a_i in list(w.get_analysis()):
            print("\ttag: {}, prob: {}".format(a_i.get_tag(), a_i.get_prob()))

# Release the splitter session.
sp.close_session(sid)
79+
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#! /usr/bin/python3
2+
3+
from freeling.freeling_API import pyfreeling
4+
import sys, os, string
5+
6+
class Freeling_tok_tagger:
    """Spanish tokenizer/PoS-tagger wrapped around the FreeLing pipeline.

    The constructor loads FreeLing's tokenizer, sentence splitter,
    morphological analyzer and HMM tagger; tokenize_and_tag() then runs
    that pipeline over a list of sentence strings and returns plain dicts.
    """

    def __init__(self):
        # FreeLing expects its data files under $FREELINGDIR/share/freeling.
        os.environ["FREELINGDIR"] = '/usr'
        if not os.path.exists(os.environ["FREELINGDIR"]+"/share/freeling"):
            print("Folder",os.environ["FREELINGDIR"]+"/share/freeling",
                  "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
                  file=sys.stderr)
            sys.exit(1)

        # Location of FreeLing configuration files.
        self.DATA = os.environ["FREELINGDIR"] + "/share/freeling/"
        # Init locales so FreeLing handles non-ASCII text correctly.
        pyfreeling.util_init_locale("default")
        # Language detector — kept only to mirror the FreeLing sample; the
        # pipeline below always assumes the language is self.LANG.
        self.la = pyfreeling.lang_ident(self.DATA + "common/lang_ident/ident-few.dat")
        # Options for the maco morphological analyzer. Default values are
        # fine except for the data-file locations.
        self.LANG = "es"
        self.op = pyfreeling.maco_options(self.LANG)
        self.op.set_data_files("",
                               self.DATA + "common/punct.dat",
                               self.DATA + self.LANG + "/dicc.src",
                               self.DATA + self.LANG + "/afixos.dat",
                               "",
                               self.DATA + self.LANG + "/locucions.dat",
                               self.DATA + self.LANG + "/np.dat",
                               self.DATA + self.LANG + "/quantities.dat",
                               self.DATA + self.LANG + "/probabilitats.dat")

        # Create analyzers.
        self.tk = pyfreeling.tokenizer(self.DATA + self.LANG + "/tokenizer.dat")
        self.sp = pyfreeling.splitter(self.DATA + self.LANG + "/splitter.dat")
        self.mf = pyfreeling.maco(self.op)

        # Activate the morpho submodules to be used in analyze().
        self.mf.set_active_options(False, True, True, True,  # select which among created
                                   True, True, False, True,  # submodules are to be used.
                                   True, True, True, True)   # default: all created submodules are used

        self.tg = pyfreeling.hmm_tagger(self.DATA + self.LANG + "/tagger.dat", True, 2)

    def tokenize_and_tag(self, sentence_list):
        """Tokenize and PoS-tag each string in *sentence_list*.

        Each input string is assumed to hold exactly one sentence; a final
        '.' is appended when the string does not already end in punctuation
        so that FreeLing's splitter recognises the sentence boundary.

        Returns a list with one dict per input string:
        {'sentence': <text as analyzed>,
         'tokens': [{'lemma': str, 'form': str, 'start': int, 'end': int,
                     'tags': [{'tag': str, 'prob': float}, ...]}, ...]}
        """
        output = []
        sid = self.sp.open_session()
        try:
            for i, lin in enumerate(sentence_list):
                # Guard the empty string (lin[-1] would raise IndexError);
                # otherwise assume a dot at the end when punctuation is missing.
                if not lin or lin[-1] not in string.punctuation:
                    lin = lin + '.'
                output.append({'sentence': lin, 'tokens': []})
                s = self.tk.tokenize(lin)
                s = self.sp.split(sid, s, False)
                s = self.mf.analyze(s)
                s = self.tg.analyze(s)
                # Each input string must split into exactly one sentence.
                # NOTE(review): assert is stripped under `python -O`.
                assert len(s) == 1
                s = s[0]
                for j, w in enumerate(s.get_words()):
                    output[i]['tokens'].append({'lemma': w.get_lemma(), 'form': w.get_form(),
                                                'start': w.get_span_start(), 'end': w.get_span_finish(),
                                                'tags': []})
                    for a in list(w.get_analysis()):
                        output[i]['tokens'][j]['tags'].append({'tag': a.get_tag(), 'prob': a.get_prob()})
        finally:
            # Always release the splitter session, even when an input string
            # fails the one-sentence check — otherwise the session leaks.
            self.sp.close_session(sid)
        return output
74+

letypes.tdl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12370,6 +12370,11 @@ n_-_pn_native_le := n_-_pn_lex & native_le
1237012370
This is a native lexical entry type, for words that are in the lexicon.
1237112371
""".
1237212372

12373+
foreign_le := n_-_pn_lex
12374+
"""
12375+
Assume for now that it is useful to treat foreign words/fragments as named entities.
12376+
""".
12377+
1237312378
n_-_pn_le := n_-_pn_lex.
1237412379

1237512380
; <type val="n_-_pr-pers-n_le">

srtypes.tdl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -881,7 +881,7 @@ basic-head-adj-phrase := basic-head-mod-phrase-simple & phrasal &
881881
head-adj-phrase := basic-head-adj-phrase & head-initial.
882882
non_str-head-adj-phrase := basic-head-adj-phrase & non_str-head-initial.
883883

884-
; We split head-adj-phrase and adj-head-phrase into two each, bone for intersective
884+
; We split head-adj-phrase and adj-head-phrase into two each, one for intersective
885885
; modifiers and one for scopal modifiers, in order to get desired results for recursive
886886
; modification as in "apparently difficult problem" (cf. Kasper '98). This split is also
887887
; used in generation, where we delay construction of intersective modification, but not scopal.
@@ -5117,4 +5117,4 @@ r_p_crd-mono-mid_constr := r_p_crd-mono-mid_phrase & binary-rule-right-to-left.
51175117
p_r_crd-mono-top_constr := p_r_crd-mono-top_phrase & binary-rule-right-to-left.
51185118
p_r_crd-mono-mid_constr := p_r_crd-mono-mid_phrase & binary-rule-right-to-left.
51195119
a_r_crd-mono-top_constr := a_r_crd-mono-top_phrase & binary-rule-right-to-left.
5120-
a_r_crd-mono-mid_constr := a_r_crd-mono-mid_phrase & binary-rule-right-to-left.
5120+
a_r_crd-mono-mid_constr := a_r_crd-mono-mid_phrase & binary-rule-right-to-left.

tsdb/mrs.txt

Lines changed: 0 additions & 107 deletions
This file was deleted.

util/override_freeling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# with the probability 78%.
77
#LEMMA_TAG_PAIRS = {'NCFS000' : {'ladra': {'prob': 0.80, 'replace': 'VMIP3S0'}}}
88

9-
REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}}
9+
REPLACE_LEMMA_AND_TAG = {'ladra': {'lemma': 'ladrar', 'tag':'VMIP3S0'}, 'dió': {'lemma': 'dar', 'tag': 'VMIS3S0'}}
1010

1111
DO_NOT_OVERRIDE = {'uf', 'je', 'ja'}
1212

0 commit comments

Comments
 (0)