diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
new file mode 100644
index 00000000..b2316674
--- /dev/null
+++ b/.github/workflows/codespell.yml
@@ -0,0 +1,25 @@
+# Codespell configuration is within pyproject.toml
+---
+name: Codespell
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+permissions:
+  contents: read
+
+jobs:
+  codespell:
+    name: Check for spelling errors
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Annotate locations with typos
+        uses: codespell-project/codespell-problem-matcher@v1
+      - name: Codespell
+        uses: codespell-project/actions-codespell@v2
diff --git a/brainscore_language/metrics/cka/metric.py b/brainscore_language/metrics/cka/metric.py
index 49f97cf0..c5f72152 100644
--- a/brainscore_language/metrics/cka/metric.py
+++ b/brainscore_language/metrics/cka/metric.py
@@ -19,7 +19,7 @@ def centering(K):
 
     return np.dot(np.dot(H, K), H)
     # HKH are the same with KH, KH is the first centering, H(KH) do the second time,
-    # results are the sme with one time centering
+    # results are the same with one time centering
     # return np.dot(H, K)  # KH
 
 
diff --git a/brainscore_language/model_helpers/container.py b/brainscore_language/model_helpers/container.py
index 896bec9d..5eba0105 100644
--- a/brainscore_language/model_helpers/container.py
+++ b/brainscore_language/model_helpers/container.py
@@ -24,7 +24,7 @@ class ContainerSubject(ArtificialSubject):
     """
-    Evaluation interface for arbitary containerized models.
+    Evaluation interface for arbitrary containerized models.
     User must install either 'Singularity' or 'Docker' to evaluate container models.
     To add new model, build a container with an entry point that supports the following interface:
 
@@ -73,7 +73,7 @@ def __init__(
         """
         :param container: Container name, e.g., "USERNAME/CONTAINER:TAG"
         :param entrypoint: Entrypoint to run inside container, e.g., "python /path/to/entrypoint.py"
-        :param identifier: Model identifer passed to entrypoint, e.g., "model_name"
+        :param identifier: Model identifier passed to entrypoint, e.g., "model_name"
         :param region_layer_mapping: Mapping from brain region to requested measure, e.g., {"language_system": "model_layer_name"}
         :param task_heads: Mapping from task to callable that takes the output of the container and returns a score, e.g., {ArtificialSubject.Task.next_word: predict_next_word_function}
         """
diff --git a/brainscore_language/model_helpers/modeling_suma.py b/brainscore_language/model_helpers/modeling_suma.py
index b4d702a4..07697e3a 100644
--- a/brainscore_language/model_helpers/modeling_suma.py
+++ b/brainscore_language/model_helpers/modeling_suma.py
@@ -1124,7 +1124,7 @@ def prepare_inputs_for_generation(
 
             # Keep only the unprocessed tokens:
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            #     some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            #     some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            #     input)
             if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                 input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
diff --git a/brainscore_language/models/earley_parser/parser.py b/brainscore_language/models/earley_parser/parser.py
index 8c306500..35710960 100644
--- a/brainscore_language/models/earley_parser/parser.py
+++ b/brainscore_language/models/earley_parser/parser.py
@@ -221,7 +221,7 @@ def create_grammar(
     :param treebank_path: a path to a treebank corpus
     :param grammar_string: one or more file names to be parsed in the grammar. If None, all files will be parsed
     :param unk_low_frequency: if True, replaces all words that appear less than k times by <unk>
-    :param k: the replacement threshold (min number of occurances for a word to NOT be replaced by <unk>)
+    :param k: the replacement threshold (min number of occurrences for a word to NOT be replaced by <unk>)
     """
 
     # Load PTB annotations
@@ -230,7 +230,7 @@ def create_grammar(
         r".*",
     )
 
-    # First, get all productions and count the occurances of each lexical in all productions
+    # First, get all productions and count the occurrences of each lexical in all productions
     productions = []
     lexical_counts = {}
     for tree in treebank.parsed_sents(fileids):
diff --git a/brainscore_language/models/earley_parser/utils.py b/brainscore_language/models/earley_parser/utils.py
index 178f5289..100973e2 100644
--- a/brainscore_language/models/earley_parser/utils.py
+++ b/brainscore_language/models/earley_parser/utils.py
@@ -1,6 +1,6 @@
 """
 Modified rule definitions for the NLTK abstract chart rules to work with a probabilistic context-free grammar.
-Added a probabilstic Earley chart parser by applying incremental chart parsing with the probabilistic rules.
+Added a probabilistic Earley chart parser by applying incremental chart parsing with the probabilistic rules.
 
 Adapted from: https://www.nltk.org/api/nltk.parse.chart.html
 """
diff --git a/pyproject.toml b/pyproject.toml
index 217638d0..405f672e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,3 +75,10 @@ markers = [
 "brainscore_language.data" = ["**"]
 "brainscore_language.metrics" = ["**"]
 "brainscore_language.models" = ["**"]
+
+[tool.codespell]
+# Ref: https://github.com/codespell-project/codespell#using-a-config-file
+skip = '.git*,*.csv,*.json,data'
+check-hidden = true
+ignore-regex = '^\s*"image/\S+": ".*'
+# ignore-words-list = ''
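
A note on the ignore-regex entry in the new [tool.codespell] section: it targets base64-encoded image payloads in Jupyter-notebook JSON (keys such as "image/png"), whose long random-looking strings would otherwise produce spurious codespell hits. Per the codespell documentation, text matching --ignore-regex is treated as whitespace, so a pattern anchored at ^ and ending in .* effectively blanks the whole line. Below is a minimal sketch of how the pattern behaves; the sample lines are hypothetical, not taken from this repository:

    import re

    # The pattern from [tool.codespell] ignore-regex. codespell treats any
    # matching text as whitespace, so this blanks entire lines whose
    # JSON key starts with "image/".
    IGNORE = re.compile(r'^\s*"image/\S+": ".*')

    # Hypothetical lines as they might appear inside a .ipynb JSON file.
    image_line = '    "image/png": "iVBORw0KGgoAAAANSUhEUgAA..."'
    text_line = '    "text/plain": "ordinary cell output"'

    assert IGNORE.match(image_line) is not None  # ignored by codespell
    assert IGNORE.match(text_line) is None       # still spell-checked

With the configuration living in pyproject.toml (as the workflow's first comment notes), running plain "codespell" from the repository root should pick up the same skip/ignore settings locally that the CI job uses.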