Skip to content

Commit fbbdd9c

Browse files
authored
Dependency Updates (#37)
* Update pytorch-partial-tagger
* Align new interfaces.
* Bump version
* Update requirements.txt
1 parent 3a5342e commit fbbdd9c

File tree

5 files changed

+50
-53
lines changed

5 files changed

+50
-53
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ requires-python = ">=3.8"
88

99
[tool.poetry]
1010
name = "spacy-partial-tagger"
11-
version = "0.15.1"
11+
version = "0.15.2"
1212
description = "Sequence Tagger for Partially Annotated Dataset in spaCy"
1313
authors = ["yasufumi <[email protected]>"]
1414
license = "MIT"
@@ -27,7 +27,7 @@ transformers = {extras = ["ja"], version = "^4.25.1"}
2727
torch = "^2.0.1"
2828
spacy = {extras = ["transformers"], version = "^3.3.1"}
2929
spacy-alignments = "^0.8.5"
30-
pytorch-partial-tagger = "^0.1.9"
30+
pytorch-partial-tagger = "^0.1.12"
3131

3232
[tool.poetry.group.dev.dependencies]
3333
mypy = "^1.3.0"

requirements.txt

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,18 @@ black==22.12.0 ; python_version >= "3.8" and python_version < "4.0"
22
blis==0.7.9 ; python_version >= "3.8" and python_version < "4.0"
33
catalogue==2.0.8 ; python_version >= "3.8" and python_version < "4.0"
44
certifi==2023.5.7 ; python_version >= "3.8" and python_version < "4.0"
5-
charset-normalizer==3.1.0 ; python_version >= "3.8" and python_version < "4.0"
6-
click==8.1.3 ; python_version >= "3.8" and python_version < "4.0"
5+
charset-normalizer==3.2.0 ; python_version >= "3.8" and python_version < "4.0"
6+
click==8.1.5 ; python_version >= "3.8" and python_version < "4.0"
77
colorama==0.4.6 ; python_version >= "3.8" and python_version < "4.0" and sys_platform == "win32" or python_version >= "3.8" and python_version < "4.0" and platform_system == "Windows"
8-
confection==0.0.4 ; python_version >= "3.8" and python_version < "4.0"
8+
confection==0.1.0 ; python_version >= "3.8" and python_version < "4.0"
99
coverage[toml]==7.2.7 ; python_version >= "3.8" and python_version < "4.0"
1010
cymem==2.0.7 ; python_version >= "3.8" and python_version < "4.0"
11-
exceptiongroup==1.1.1 ; python_version >= "3.8" and python_version < "3.11"
11+
exceptiongroup==1.1.2 ; python_version >= "3.8" and python_version < "3.11"
1212
filelock==3.12.2 ; python_version >= "3.8" and python_version < "4.0"
1313
flake8==4.0.1 ; python_version >= "3.8" and python_version < "4.0"
1414
fsspec==2023.6.0 ; python_version >= "3.8" and python_version < "4.0"
1515
fugashi==1.2.1 ; python_version >= "3.8" and python_version < "4.0"
16-
huggingface-hub==0.15.1 ; python_version >= "3.8" and python_version < "4.0"
16+
huggingface-hub==0.16.4 ; python_version >= "3.8" and python_version < "4.0"
1717
idna==3.4 ; python_version >= "3.8" and python_version < "4.0"
1818
iniconfig==2.0.0 ; python_version >= "3.8" and python_version < "4.0"
1919
ipadic==1.0.0 ; python_version >= "3.8" and python_version < "4.0"
@@ -25,36 +25,36 @@ mccabe==0.6.1 ; python_version >= "3.8" and python_version < "4.0"
2525
mpmath==1.3.0 ; python_version >= "3.8" and python_version < "4.0"
2626
murmurhash==1.0.9 ; python_version >= "3.8" and python_version < "4.0"
2727
mypy-extensions==1.0.0 ; python_version >= "3.8" and python_version < "4.0"
28-
mypy==1.3.0 ; python_version >= "3.8" and python_version < "4.0"
28+
mypy==1.4.1 ; python_version >= "3.8" and python_version < "4.0"
2929
networkx==3.1 ; python_version >= "3.8" and python_version < "4.0"
30-
numpy==1.24.3 ; python_version >= "3.8" and python_version < "4.0"
30+
numpy==1.24.4 ; python_version >= "3.8" and python_version < "4.0"
3131
packaging==23.1 ; python_version >= "3.8" and python_version < "4.0"
3232
pathspec==0.11.1 ; python_version >= "3.8" and python_version < "4.0"
33-
pathy==0.10.1 ; python_version >= "3.8" and python_version < "4.0"
33+
pathy==0.10.2 ; python_version >= "3.8" and python_version < "4.0"
3434
plac==1.3.5 ; python_version >= "3.8" and python_version < "4.0"
35-
platformdirs==3.6.0 ; python_version >= "3.8" and python_version < "4.0"
36-
pluggy==1.0.0 ; python_version >= "3.8" and python_version < "4.0"
35+
platformdirs==3.9.1 ; python_version >= "3.8" and python_version < "4.0"
36+
pluggy==1.2.0 ; python_version >= "3.8" and python_version < "4.0"
3737
preshed==3.0.8 ; python_version >= "3.8" and python_version < "4.0"
3838
pycodestyle==2.8.0 ; python_version >= "3.8" and python_version < "4.0"
39-
pydantic==1.10.9 ; python_version >= "3.8" and python_version < "4.0"
39+
pydantic==1.10.11 ; python_version >= "3.8" and python_version < "4.0"
4040
pyflakes==2.4.0 ; python_version >= "3.8" and python_version < "4.0"
4141
pytest-cov==3.0.0 ; python_version >= "3.8" and python_version < "4.0"
42-
pytest==7.3.2 ; python_version >= "3.8" and python_version < "4.0"
43-
pytorch-partial-tagger==0.1.9 ; python_version >= "3.8" and python_version < "4.0"
42+
pytest==7.4.0 ; python_version >= "3.8" and python_version < "4.0"
43+
pytorch-partial-tagger==0.1.12 ; python_version >= "3.8" and python_version < "4.0"
4444
pyyaml==6.0 ; python_version >= "3.8" and python_version < "4.0"
4545
regex==2023.6.3 ; python_version >= "3.8" and python_version < "4.0"
4646
requests==2.31.0 ; python_version >= "3.8" and python_version < "4.0"
4747
rhoknp==1.3.0 ; python_version >= "3.8" and python_version < "4.0"
4848
ruff==0.0.270 ; python_version >= "3.8" and python_version < "4.0"
4949
safetensors==0.3.1 ; python_version >= "3.8" and python_version < "4.0"
50-
setuptools==67.8.0 ; python_version >= "3.8" and python_version < "4.0"
50+
setuptools==68.0.0 ; python_version >= "3.8" and python_version < "4.0"
5151
smart-open==6.3.0 ; python_version >= "3.8" and python_version < "4.0"
5252
spacy-alignments==0.8.6 ; python_version >= "3.8" and python_version < "4.0"
5353
spacy-legacy==3.0.12 ; python_version >= "3.8" and python_version < "4.0"
5454
spacy-loggers==1.0.4 ; python_version >= "3.8" and python_version < "4.0"
5555
spacy-transformers==1.2.5 ; python_version >= "3.8" and python_version < "4.0"
56-
spacy==3.5.3 ; python_version >= "3.8" and python_version < "4.0"
57-
spacy[transformers]==3.5.3 ; python_version >= "3.8" and python_version < "4.0"
56+
spacy==3.6.0 ; python_version >= "3.8" and python_version < "4.0"
57+
spacy[transformers]==3.6.0 ; python_version >= "3.8" and python_version < "4.0"
5858
srsly==2.4.6 ; python_version >= "3.8" and python_version < "4.0"
5959
sudachidict-core==20230110 ; python_version >= "3.8" and python_version < "4.0"
6060
sudachipy==0.6.7 ; python_version >= "3.8" and python_version < "4.0"
@@ -66,8 +66,8 @@ torch==2.0.1 ; python_version >= "3.8" and python_version < "4.0"
6666
tqdm==4.65.0 ; python_version >= "3.8" and python_version < "4.0"
6767
transformers==4.30.2 ; python_version >= "3.8" and python_version < "4.0"
6868
transformers[ja]==4.30.2 ; python_version >= "3.8" and python_version < "4.0"
69-
typer==0.7.0 ; python_version >= "3.8" and python_version < "4.0"
70-
typing-extensions==4.6.3 ; python_version >= "3.8" and python_version < "4.0"
69+
typer==0.9.0 ; python_version >= "3.8" and python_version < "4.0"
70+
typing-extensions==4.7.1 ; python_version >= "3.8" and python_version < "4.0"
7171
unidic-lite==1.0.8 ; python_version >= "3.8" and python_version < "4.0"
7272
unidic==1.1.0 ; python_version >= "3.8" and python_version < "4.0"
7373
urllib3==2.0.3 ; python_version >= "3.8" and python_version < "4.0"

spacy_partial_tagger/pipeline.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22

33
import srsly
44
import torch
5-
from partial_tagger.data import CharBasedTags, LabelSet
5+
from partial_tagger.data import LabelSet
66
from partial_tagger.data.batch.tag import TagsBatch
7-
from partial_tagger.data.batch.text import create_token_based_tags
87
from partial_tagger.training import compute_partially_supervised_loss
98
from partial_tagger.utils import create_tag
109
from spacy import util
@@ -52,15 +51,15 @@ def set_annotations(
5251
docs: List[Doc],
5352
tag_indices: Floats2d,
5453
) -> None:
55-
tokenized_texts = [doc.user_data["tokenized_text"] for doc in docs]
5654

57-
tags_batch = create_token_based_tags(
58-
tokenized_texts, tag_indices, self.label_set, self.padding_index
59-
)
60-
61-
for doc, tags in zip(docs, tags_batch):
55+
for doc, indices in zip(docs, tag_indices.tolist()):
56+
indices = [index for index in indices if index != self.padding_index]
57+
alignment = doc.user_data["alignment"]
6258
ents = []
63-
for tag in tags:
59+
for tag in alignment.create_char_based_tags(
60+
tag_indices=indices,
61+
label_set=self.label_set,
62+
):
6463
span = doc.char_span(tag.start, tag.start + tag.length, tag.label)
6564
if span:
6665
ents.append(span)
@@ -114,24 +113,26 @@ def get_loss(
114113
) -> Tuple[float, Floats4d]:
115114
scores_pt = xp2torch(scores, requires_grad=True)
116115

117-
token_based_tags = []
116+
char_based_tags = []
117+
alignments = []
118118
lengths = []
119119
for example in examples:
120120
tags = tuple(
121121
create_tag(ent.start_char, len(ent.text), ent.label_)
122122
for ent in example.y.ents
123123
)
124-
tokenized_text = example.x.user_data["tokenized_text"]
125-
token_based_tags.append(
126-
CharBasedTags(tags, example.x.text).convert_to_token_based(
127-
tokenized_text
128-
)
129-
)
130-
lengths.append(tokenized_text.num_tokens)
124+
char_based_tags.append(tags)
131125

132-
tags_batch = TagsBatch(tuple(token_based_tags), self.label_set)
126+
alignment = example.x.user_data["alignment"]
127+
lengths.append(alignment.num_tokens)
128+
alignments.append(alignment)
129+
130+
tags_batch = TagsBatch(
131+
tags_batch=tuple(char_based_tags),
132+
alignments=alignments,
133+
)
133134
tags_batch.to(scores_pt.device)
134-
tag_bitmap = tags_batch.get_tag_bitmap()
135+
tag_bitmap = tags_batch.get_tag_bitmap(self.label_set)
135136

136137
max_length = max(lengths)
137138
mask = torch.tensor(

spacy_partial_tagger/tagger.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def forward(
4747

4848
text_batch = tokenizer(tuple(doc.text for doc in X))
4949

50-
for doc, text in zip(X, text_batch.tokenized_texts):
51-
doc.user_data["tokenized_text"] = text
50+
for doc, alignment in zip(X, text_batch.alignments):
51+
doc.user_data["alignment"] = alignment
5252

5353
device = get_torch_default_device()
5454
text_batch.to(device)

spacy_partial_tagger/tokenizer.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import Optional, Tuple
22

3-
import torch
4-
from partial_tagger.data import Span, TokenizedText
3+
from partial_tagger.data import Alignment, Span
54
from partial_tagger.data.batch.text import (
65
BaseTokenizer,
76
TextBatch,
@@ -26,6 +25,7 @@ def __init__(
2625

2726
self.__tokenizer_args = tokenizer_args or {
2827
"padding": True,
28+
"truncation": True,
2929
"return_tensors": "pt",
3030
}
3131
self.__tokenizer_args["return_offsets_mapping"] = True
@@ -34,9 +34,10 @@ def __call__(self, texts: Tuple[str]) -> TextBatch:
3434
batch_encoding = self.__tokenizer(texts, **self.__tokenizer_args)
3535

3636
pad_token_id = self.__tokenizer.pad_token_id
37-
tokenized_text_lengths = (batch_encoding.input_ids != pad_token_id).sum(dim=1)
37+
mask = batch_encoding.input_ids != pad_token_id
38+
tokenized_text_lengths = mask.sum(dim=1)
3839

39-
tokenized_texts = []
40+
alignments = []
4041
for _tokenized_text_length, input_ids, text in zip(
4142
tokenized_text_lengths, batch_encoding.input_ids, texts
4243
):
@@ -52,16 +53,11 @@ def __call__(self, texts: Tuple[str]) -> TextBatch:
5253
end = char_span.start + char_span.length
5354
token_indices[start:end] = [token_index] * char_span.length
5455

55-
tokenized_texts.append(
56-
TokenizedText(text, char_spans, tuple(token_indices))
57-
)
56+
alignments.append(Alignment(text, char_spans, tuple(token_indices)))
5857

59-
lengths = [text.num_tokens for text in tokenized_texts]
60-
max_length = max(lengths)
61-
mask = torch.tensor(
62-
[[True] * length + [False] * (max_length - length) for length in lengths]
58+
return TextBatch(
59+
tagger_inputs=batch_encoding, mask=mask, alignments=tuple(alignments)
6360
)
64-
return TextBatch(tuple(tokenized_texts), batch_encoding, mask)
6561

6662

6763
def get_tokenizer(

0 commit comments

Comments (0)