Skip to content

Commit 9775781

Browse files
authored
Update how git-theta track ${path} works. (#227)
* Update how `git-theta track ${path}` works. Currently, when a pattern in `.gitattributes` matches the path in `git-theta track`, the theta attributes (filter, merge, diff) are added to that line. This can cause issues as it may result in tracking more files with git-theta than expected. However, just adding a new line that is an exact match for the path to the end of the `.gitattributes` file is not correct either. The *last* attribute line in the file is the one used by Git. This could result in `git-theta track` removing some other attribute that is set for that file. This PR updates the way that git attributes are set. If attributes used by git-theta (filter, merge, diff) are already set to non-theta values for the file, an error is raised. If these attributes are all set to git-theta, no new entry is added, even when the entry is a pattern match instead of an exact match. If the new file has no attributes set, or attributes that don't overlap with git-theta, then a new entry is added. Non-overlapping attributes are copied down if they were set before. When a line has an attribute set multiple times, the *last* one is used, so we can add the theta filters at the end to override any previous ones. Added an `is_theta_tracked` function, similar to the one from #214, where the test for whether a file is tracked by Git-Theta is abstracted into a function. However, it is implemented differently as it now handles the subtleties of what attribute line is active at a given time. Only the final line that a path matches is used to set attributes and only the last entry for some key is used. This is now respected. Git attributes support the "unsetting" of attributes (e.g. "-diff"); these attributes are correctly copied around, but they aren't currently logically checked to see if they intersect with git-theta attributes. * Bump the setup-python action version
1 parent b4dc503 commit 9775781

File tree

7 files changed

+301
-75
lines changed

7 files changed

+301
-75
lines changed

.github/workflows/end2endtest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
steps:
2525
- uses: actions/checkout@v2
2626
- name: Set Up Python ${{ matrix.python-version }}
27-
uses: actions/setup-python@v2
27+
uses: actions/setup-python@v4
2828
with:
2929
python-version: ${{ matrix.python-version }}
3030
cache: "pip"

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
runs-on: ubuntu-latest
2222
steps:
2323
- uses: actions/checkout@v2
24-
- uses: actions/setup-python@v2
24+
- uses: actions/setup-python@v4
2525
with:
2626
python-version: 3.8
2727
# Install package and deps so third-party packages are sorted

.github/workflows/publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
steps:
1212
- uses: actions/checkout@v2
1313
- name: Setup Python
14-
uses: actions/setup-python@v2
14+
uses: actions/setup-python@v4
1515
with:
1616
python-version: "3.8"
1717
- name: Install Build Package

.github/workflows/unittest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
steps:
2525
- uses: actions/checkout@v2
2626
- name: Set Up Python ${{ matrix.python-version }}
27-
uses: actions/setup-python@v2
27+
uses: actions/setup-python@v4
2828
with:
2929
python-version: ${{ matrix.python-version }}
3030
cache: "pip"

git_theta/git_utils.py

Lines changed: 135 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Utilities for manipulating git."""
22

3+
import copy
4+
import dataclasses
35
import filecmp
46
import fnmatch
57
import io
@@ -10,7 +12,7 @@
1012
import shutil
1113
import subprocess
1214
import sys
13-
from typing import List, Sequence, Union
15+
from typing import Dict, List, Optional, Sequence, Union
1416

1517
import git
1618

@@ -26,6 +28,11 @@
2628

2729
from git_theta import async_utils
2830

31+
# These are the git attributes that git-theta currently uses to manage checked-in
32+
# files. Defined as a variable in case extra functionality ever requires more
33+
# attributes.
34+
THETA_ATTRIBUTES = ("filter", "merge", "diff")
35+
2936

3037
def get_git_repo():
3138
"""
@@ -107,7 +114,26 @@ def get_gitattributes_file(repo):
107114
return os.path.join(repo.working_dir, ".gitattributes")
108115

109116

110-
def read_gitattributes(gitattributes_file):
117+
@dataclasses.dataclass
118+
class GitAttributes:
119+
"""Git attributes for a file that matches pattern."""
120+
121+
pattern: str
122+
attributes: Dict[str, str]
123+
raw: Optional[str] = None
124+
125+
def __str__(self):
126+
if self.raw:
127+
return self.raw
128+
attrs = " ".join(f"{k}={v}" if v else k for k, v in self.attributes.items())
129+
return f"{self.pattern} {attrs}"
130+
131+
def __eq__(self, o):
132+
raw_eq = self.raw == o.raw if self.raw and o.raw else True
133+
return self.pattern == o.pattern and self.attributes == o.attributes and raw_eq
134+
135+
136+
def read_gitattributes(gitattributes_file) -> List[GitAttributes]:
111137
"""
112138
Read contents of this repo's .gitattributes file
113139
@@ -123,14 +149,33 @@ def read_gitattributes(gitattributes_file):
123149
"""
124150
if os.path.exists(gitattributes_file):
125151
with open(gitattributes_file, "r") as f:
126-
return [line.rstrip("\n") for line in f]
152+
return [parse_gitattributes(line.rstrip("\n")) for line in f]
127153
else:
128154
return []
129155

130156

157+
def parse_gitattributes(gitattributes: str) -> GitAttributes:
158+
# TODO: Fix for escaped patterns
159+
pattern, *attributes = gitattributes.split(" ")
160+
attrs = {}
161+
# Overwrite as we go to get the LAST attribute behavior
162+
for attribute in attributes:
163+
if "=" in attribute:
164+
key, value = attribute.split("=")
165+
# TODO: Update to handle unsetting attributes like "-diff". Currently we
166+
# just copy them as keys for printing but don't check their semantics,
167+
# for example a file with an unset diff does not currently throw an error
168+
# when adding git-theta tracking.
169+
else:
170+
key = attribute
171+
value = None
172+
attrs[key] = value
173+
return GitAttributes(pattern, attrs, gitattributes)
174+
175+
131176
@file_or_name(gitattributes_file="w")
132177
def write_gitattributes(
133-
gitattributes_file: Union[str, io.FileIO], attributes: List[str]
178+
gitattributes_file: Union[str, io.FileIO], attributes: List[GitAttributes]
134179
):
135180
"""
136181
Write list of attributes to this repo's .gitattributes file
@@ -143,60 +188,112 @@ def write_gitattributes(
143188
attributes:
144189
Attributes to write to .gitattributes
145190
"""
146-
gitattributes_file.write("\n".join(attributes))
191+
gitattributes_file.write("\n".join(map(str, attributes)))
147192
# End file with newline.
148193
gitattributes_file.write("\n")
149194

150195

151-
def add_theta_to_gitattributes(gitattributes: List[str], path: str) -> str:
152-
"""Add a filter=theta that covers file_name.
196+
def add_theta_to_gitattributes(
197+
gitattributes: List[GitAttributes],
198+
path: str,
199+
theta_attributes: Sequence[str] = THETA_ATTRIBUTES,
200+
) -> List[GitAttributes]:
201+
"""Add git attributes required by git-theta for path.
202+
203+
If there is a pattern that covers the current file that applies the git-theta
204+
attributes, no new pattern is added. If there is a pattern that covers the
205+
current file and sets attributes used by git-theta an error is raised. If
206+
there is a pattern that sets non-overlapping attributes they are copied into
207+
a new path-specific pattern. If there is no match, a new path-specific
208+
pattern is always created.
153209
154210
Parameters
155211
----------
156-
gitattributes: A list of the lines from the gitattribute files.
212+
gitattributes: A list of parsed git attribute entries.
157213
path: The path to the model we are adding a filter to.
158214
215+
Raises
216+
------
217+
ValueError
218+
`path` is covered by an active git attributes entry that sets merge,
219+
filter, or diff to a value other than "theta".
220+
159221
Returns
160222
-------
161-
List[str]
162-
The lines to write to the new gitattribute file with a (possibly) new
163-
filter=theta added that covers the given file.
223+
List[GitAttributes]
224+
The git attributes to write to the new gitattributes file with a (possibly)
225+
new (filter|merge|diff)=theta added that covers `path`.
164226
"""
165-
pattern_found = False
166-
new_gitattributes = []
167-
for line in gitattributes:
168-
# TODO(bdlester): Revisit this regex to see if it when the pattern
169-
# is escaped due to having spaces in it.
170-
match = re.match(r"^\s*(?P<pattern>[^\s]+)\s+(?P<attributes>.*)$", line)
171-
if match:
172-
# If there is already a pattern that covers the file, add the filter
173-
# to that.
174-
if fnmatch.fnmatchcase(path, match.group("pattern")):
175-
pattern_found = True
176-
if not "filter=theta" in match.group("attributes"):
177-
line = f"{line.rstrip()} filter=theta"
178-
if not "merge=theta" in match.group("attributes"):
179-
line = f"{line.rstrip()} merge=theta"
180-
if not "diff=theta" in match.group("attributes"):
181-
line = f"{line.rstrip()} diff=theta"
182-
new_gitattributes.append(line)
183-
# If we don't find a matching pattern, add a new line that covers just this
184-
# specific file.
185-
if not pattern_found:
186-
new_gitattributes.append(f"{path} filter=theta merge=theta diff=theta")
187-
return new_gitattributes
188-
189-
190-
def get_gitattributes_tracked_patterns(gitattributes_file):
227+
previous_attribute = None
228+
# Find if an active gitattribute entry applies to path
229+
for gitattribute in gitattributes[::-1]:
230+
if fnmatch.fnmatchcase(path, gitattribute.pattern):
231+
previous_attribute = gitattribute
232+
break
233+
# If path is already managed by a git attributes entry.
234+
if previous_attribute:
235+
# If all of the theta attributes are set, we don't do anything.
236+
if all(
237+
previous_attribute.attributes.get(attr) == "theta"
238+
for attr in theta_attributes
239+
):
240+
return gitattributes
241+
# If any of the attributes theta uses is set to something else, error out.
242+
if any(
243+
attr in previous_attribute.attributes
244+
and previous_attribute.attributes[attr] != "theta"
245+
for attr in theta_attributes
246+
):
247+
raise ValueError(
248+
f"Git Attributes used by git-theta are already set for {path}. "
249+
f"Found filter={previous_attribute.attributes.get('filter')}, "
250+
f"diff={previous_attribute.attributes.get('diff')}, "
251+
f"merge={previous_attribute.attributes.get('merge')}."
252+
)
253+
# If the old entry set other attributes, make sure they are preserved.
254+
attributes = (
255+
copy.deepcopy(previous_attribute.attributes) if previous_attribute else {}
256+
)
257+
for attr in theta_attributes:
258+
attributes[attr] = "theta"
259+
new_attribute = GitAttributes(path, attributes)
260+
gitattributes.append(new_attribute)
261+
return gitattributes
262+
263+
264+
def get_gitattributes_tracked_patterns(
265+
gitattributes_file, theta_attributes: Sequence[str] = THETA_ATTRIBUTES
266+
):
191267
gitattributes = read_gitattributes(gitattributes_file)
192268
theta_attributes = [
193-
attribute for attribute in gitattributes if "filter=theta" in attribute
269+
attr
270+
for attr in gitattributes
271+
if attr.attributes.get(a) == "theta"
272+
for a in theta_attributes
194273
]
274+
return [attr.pattern for attr in theta_attributes]
195275
# TODO: Correctly handle patterns with escaped spaces in them
196276
patterns = [attribute.split(" ")[0] for attribute in theta_attributes]
197277
return patterns
198278

199279

280+
def is_theta_tracked(
281+
path: str,
282+
gitattributes: List[GitAttributes],
283+
theta_attributes: Sequence[str] = THETA_ATTRIBUTES,
284+
) -> bool:
285+
"""Check if `path` is tracked by git-theta based on `.gitattributes`.
286+
287+
Note: The last line that matches in .gitattributes is the active one so
288+
start from the end. If the first match (really last) does not have the
289+
theta filter active then the file is not tracked by Git-Theta.
290+
"""
291+
for attr in gitattributes[::-1]:
292+
if fnmatch.fnmatchcase(path, attr.pattern):
293+
return all(attr.attributes.get(a) == "theta" for a in theta_attributes)
294+
return False
295+
296+
200297
def add_file(f, repo):
201298
"""
202299
Add file to git staging area

git_theta/scripts/git_theta.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,12 @@ def post_commit(args):
8080
theta_commits = theta.ThetaCommits(repo)
8181

8282
gitattributes_file = git_utils.get_gitattributes_file(repo)
83-
patterns = git_utils.get_gitattributes_tracked_patterns(gitattributes_file)
83+
gitattributes = git_utils.read_gitattributes(gitattributes_file)
8484

8585
oids = set()
8686
commit = repo.commit("HEAD")
8787
for path in commit.stats.files.keys():
88-
if any([fnmatch.fnmatchcase(path, pattern) for pattern in patterns]):
88+
if git_utils.is_theta_tracked(path, gitattributes):
8989
curr_metadata = metadata.Metadata.from_file(commit.tree[path].data_stream)
9090
prev_metadata = metadata.Metadata.from_commit(repo, path, "HEAD~1")
9191

0 commit comments

Comments
 (0)