Skip to content

File tree

5 files changed: +82 additions, -57 deletions (lines changed)

.travis.yml

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,14 +11,17 @@ cache:
1111
directories:
1212
- $HOME/.cache
1313

14+
dist: trusty
15+
1416
env:
1517
global:
16-
- ES_VERSION=2.4.2 ES_DOWNLOAD_URL=https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/tar/elasticsearch/${ES_VERSION}/elasticsearch-${ES_VERSION}.tar.gz
18+
- ES_JAVA_OPTS="-Xms512m -Xmx512m"
19+
- ES_VERSION=5.4.0 ES_DOWNLOAD_URL=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz
1720
- DATABASE_PORT="54321"
1821
- PROJECT_DIR="$PWD"
1922
- WHEELHOUSE="$HOME/.cache/wheelhouse"
2023
- LIBXML2_DEB="libxml2_2.7.8.dfsg-5.1ubuntu4.15_amd64.deb"
21-
- POSTGRES_DEB="postgresql-9.5_9.5.5-1.pgdg12.4+1_amd64.deb"
24+
- POSTGRES_DEB="postgresql-9.5_9.5.1-1.pgdg60+1_amd64.deb"
2225

2326
before_install:
2427
# cache directories

bots/elasticsearch/bot.py

Lines changed: 37 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -66,8 +66,7 @@ class ElasticSearchBot(Bot):
6666

6767
EXACT_FIELD = {
6868
'exact': {
69-
'type': 'string',
70-
'index': 'not_analyzed',
69+
'type': 'keyword',
7170
# From Elasticsearch documentation:
7271
# The value for ignore_above is the character count, but Lucene counts bytes.
7372
# If you use UTF-8 text with many non-ASCII characters, you may want to set the limit to 32766 / 3 = 10922 since UTF-8 characters may occupy at most 3 bytes
@@ -79,67 +78,67 @@ class ElasticSearchBot(Bot):
7978
'creativeworks': {
8079
'dynamic': False,
8180
'properties': {
82-
'affiliations': {'type': 'string', 'fields': EXACT_FIELD},
83-
'contributors': {'type': 'string', 'fields': EXACT_FIELD},
81+
'affiliations': {'type': 'text', 'fields': EXACT_FIELD},
82+
'contributors': {'type': 'text', 'fields': EXACT_FIELD},
8483
'date': {'type': 'date', 'format': 'strict_date_optional_time', 'include_in_all': False},
8584
'date_created': {'type': 'date', 'format': 'strict_date_optional_time', 'include_in_all': False},
8685
'date_modified': {'type': 'date', 'format': 'strict_date_optional_time', 'include_in_all': False},
8786
'date_published': {'type': 'date', 'format': 'strict_date_optional_time', 'include_in_all': False},
8887
'date_updated': {'type': 'date', 'format': 'strict_date_optional_time', 'include_in_all': False},
89-
'description': {'type': 'string'},
90-
'funders': {'type': 'string', 'fields': EXACT_FIELD},
91-
'hosts': {'type': 'string', 'fields': EXACT_FIELD},
92-
'id': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
93-
'identifiers': {'type': 'string', 'fields': EXACT_FIELD},
94-
'justification': {'type': 'string', 'include_in_all': False},
95-
'language': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
96-
'publishers': {'type': 'string', 'fields': EXACT_FIELD},
97-
'registration_type': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
88+
'description': {'type': 'text'},
89+
'funders': {'type': 'text', 'fields': EXACT_FIELD},
90+
'hosts': {'type': 'text', 'fields': EXACT_FIELD},
91+
'id': {'type': 'keyword', 'include_in_all': False},
92+
'identifiers': {'type': 'text', 'fields': EXACT_FIELD},
93+
'justification': {'type': 'text', 'include_in_all': False},
94+
'language': {'type': 'keyword', 'include_in_all': False},
95+
'publishers': {'type': 'text', 'fields': EXACT_FIELD},
96+
'registration_type': {'type': 'keyword', 'include_in_all': False},
9897
'retracted': {'type': 'boolean', 'include_in_all': False},
99-
'sources': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
100-
'subjects': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
101-
'tags': {'type': 'string', 'fields': EXACT_FIELD},
102-
'title': {'type': 'string', 'fields': EXACT_FIELD},
103-
'type': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
104-
'types': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
98+
'sources': {'type': 'keyword', 'include_in_all': False},
99+
'subjects': {'type': 'keyword', 'include_in_all': False},
100+
'tags': {'type': 'text', 'fields': EXACT_FIELD},
101+
'title': {'type': 'text', 'fields': EXACT_FIELD},
102+
'type': {'type': 'keyword', 'include_in_all': False},
103+
'types': {'type': 'keyword', 'include_in_all': False},
105104
'withdrawn': {'type': 'boolean', 'include_in_all': False},
106105
'lists': {'type': 'object', 'dynamic': True, 'include_in_all': False},
107106
},
108107
'dynamic_templates': [
109-
{'exact_field_on_lists_strings': {'path_match': 'lists.*', 'match_mapping_type': 'string', 'mapping': {'type': 'string', 'fields': EXACT_FIELD}}},
108+
{'exact_field_on_lists_strings': {'path_match': 'lists.*', 'match_mapping_type': 'string', 'mapping': {'type': 'text', 'fields': EXACT_FIELD}}},
110109
]
111110
},
112111
'agents': {
113112
'dynamic': False,
114113
'properties': {
115-
'id': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
116-
'identifiers': {'type': 'string', 'fields': EXACT_FIELD},
117-
'name': {'type': 'string', 'fields': {**AUTOCOMPLETE_FIELD, **EXACT_FIELD}},
118-
'family_name': {'type': 'string', 'include_in_all': False},
119-
'given_name': {'type': 'string', 'include_in_all': False},
120-
'additional_name': {'type': 'string', 'include_in_all': False},
121-
'suffix': {'type': 'string', 'include_in_all': False},
122-
'location': {'type': 'string', 'include_in_all': False},
123-
'sources': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
124-
'type': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
125-
'types': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
114+
'id': {'type': 'keyword', 'include_in_all': False},
115+
'identifiers': {'type': 'text', 'fields': EXACT_FIELD},
116+
'name': {'type': 'text', 'fields': {**AUTOCOMPLETE_FIELD, **EXACT_FIELD}},
117+
'family_name': {'type': 'text', 'include_in_all': False},
118+
'given_name': {'type': 'text', 'include_in_all': False},
119+
'additional_name': {'type': 'text', 'include_in_all': False},
120+
'suffix': {'type': 'text', 'include_in_all': False},
121+
'location': {'type': 'text', 'include_in_all': False},
122+
'sources': {'type': 'keyword', 'include_in_all': False},
123+
'type': {'type': 'keyword', 'include_in_all': False},
124+
'types': {'type': 'keyword', 'include_in_all': False},
126125
}
127126
},
128127
'sources': {
129128
'dynamic': False,
130129
'properties': {
131-
'id': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
132-
'name': {'type': 'string', 'fields': {**AUTOCOMPLETE_FIELD, **EXACT_FIELD}},
133-
'short_name': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
134-
'type': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
130+
'id': {'type': 'keyword', 'include_in_all': False},
131+
'name': {'type': 'text', 'fields': {**AUTOCOMPLETE_FIELD, **EXACT_FIELD}},
132+
'short_name': {'type': 'keyword', 'include_in_all': False},
133+
'type': {'type': 'keyword', 'include_in_all': False},
135134
}
136135
},
137136
'tags': {
138137
'dynamic': False,
139138
'properties': {
140-
'id': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
141-
'name': {'type': 'string', 'fields': {**AUTOCOMPLETE_FIELD, **EXACT_FIELD}},
142-
'type': {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
139+
'id': {'type': 'keyword', 'include_in_all': False},
140+
'name': {'type': 'text', 'fields': {**AUTOCOMPLETE_FIELD, **EXACT_FIELD}},
141+
'type': {'type': 'keyword', 'include_in_all': False},
143142
}
144143
},
145144
}

docker-compose.yml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,9 +26,11 @@
2626
version: '2'
2727
services:
2828
elasticsearch:
29-
image: elasticsearch:2
29+
image: elasticsearch:5.4
3030
ports:
3131
- 9200:9200
32+
environment:
33+
ES_JAVA_OPTS: "-Xms512m -Xmx512m"
3234

3335
rabbitmq:
3436
image: rabbitmq:management

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ django[bcrypt]==1.11 # BSD 3 Clause
1919
djangorestframework-jsonapi==2.1.1 # MIT
2020
djangorestframework==3.6.2 # BSD
2121
docopt==0.6.2 # MIT
22-
elasticsearch==2.3.0 # Apache 2.0
22+
elasticsearch==5.4.0 # Apache 2.0
2323
furl==0.4.95 # None
2424
gevent==1.1.1 # MIT
2525
graphene==1.4 # MIT

tests/bots/test_elastic.py

Lines changed: 36 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,42 @@ def test_source_soft_deleted(self, elastic):
5656
assert doc['_source']['title'] == x.title
5757
assert doc['_source']['sources'] == []
5858

59+
def test_51_identifiers_rejected(self, elastic):
60+
work1 = factories.AbstractCreativeWorkFactory()
61+
work2 = factories.AbstractCreativeWorkFactory()
62+
for i in range(50):
63+
factories.WorkIdentifierFactory(uri='http://example.com/{}'.format(i), creative_work=work1)
64+
factories.WorkIdentifierFactory(uri='http://example.com/{}/{}'.format(i, i), creative_work=work2)
65+
factories.WorkIdentifierFactory(creative_work=work2)
66+
67+
tasks.IndexModelTask().apply((1, elastic.config.label, 'creativework', [work1.id, work2.id]))
68+
69+
elastic.es_client.get(index=elastic.es_index, doc_type='creativeworks', id=IDObfuscator.encode(work1))
70+
71+
with pytest.raises(NotFoundError):
72+
elastic.es_client.get(index=elastic.es_index, doc_type='creativeworks', id=IDObfuscator.encode(work2))
73+
74+
def test_aggregation(self, elastic):
75+
work = factories.AbstractCreativeWorkFactory()
76+
77+
sources = [factories.SourceFactory() for _ in range(4)]
78+
work.sources.add(*[s.user for s in sources])
79+
80+
tasks.IndexModelTask().apply((1, elastic.config.label, 'creativework', [work.id]))
81+
82+
elastic.es_client.indices.refresh(index=elastic.es_index)
83+
84+
resp = elastic.es_client.search(index=elastic.es_index, doc_type='creativeworks', body={
85+
'size': 0,
86+
'aggregations': {
87+
'sources': {
88+
'terms': {'field': 'sources', 'size': 500}
89+
}
90+
}
91+
})
92+
93+
assert sorted(resp['aggregations']['sources']['buckets'], key=lambda x: x['key']) == sorted([{'key': source.long_title, 'doc_count': 1} for source in sources], key=lambda x: x['key'])
94+
5995

6096
@pytest.mark.django_db
6197
class TestIndexSource:
@@ -91,21 +127,6 @@ def test_index_no_icon(self, elastic):
91127
with pytest.raises(NotFoundError):
92128
elastic.es_client.get(index=elastic.es_index, doc_type='sources', id=source.name)
93129

94-
def test_51_identifiers_rejected(self, elastic):
95-
work1 = factories.AbstractCreativeWorkFactory()
96-
work2 = factories.AbstractCreativeWorkFactory()
97-
for i in range(50):
98-
factories.WorkIdentifierFactory(uri='http://example.com/{}'.format(i), creative_work=work1)
99-
factories.WorkIdentifierFactory(uri='http://example.com/{}/{}'.format(i, i), creative_work=work2)
100-
factories.WorkIdentifierFactory(creative_work=work2)
101-
102-
tasks.IndexModelTask().apply((1, elastic.config.label, 'creativework', [work1.id, work2.id]))
103-
104-
elastic.es_client.get(index=elastic.es_index, doc_type='creativeworks', id=IDObfuscator.encode(work1))
105-
106-
with pytest.raises(NotFoundError):
107-
elastic.es_client.get(index=elastic.es_index, doc_type='creativeworks', id=IDObfuscator.encode(work2))
108-
109130

110131
@pytest.mark.django_db
111132
class TestJanitorTask:

0 commit comments

Comments
 (0)