Skip to content

Commit df3810f

Browse files
authored
Ensure bots propagate through concat (#156)
1 parent abff6af commit df3810f

File tree

4 files changed

+115
-10
lines changed

4 files changed

+115
-10
lines changed

github_activity/github_activity.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ def get_activity(
134134
-------
135135
query_data : pandas DataFrame
136136
A munged collection of data returned from your query. This
137-
will be a combination of issues and PRs.
137+
will be a combination of issues and PRs. The DataFrame has a
138+
`bot_users` attribute containing the set of detected bot usernames.
138139
"""
139140

140141
org, repo = _parse_target(target)
@@ -206,13 +207,16 @@ def get_activity(
206207
# Query for both opened and closed issues/PRs in this window
207208
print(f"Running search query:\n{search_query}\n\n", file=sys.stderr)
208209
query_data = []
210+
all_bot_users = set()
209211
for activity_type in ["created", "closed"]:
210212
ii_search_query = (
211213
search_query + f" {activity_type}:{since_dt_str}..{until_dt_str}"
212214
)
213215
qu = GitHubGraphQlQuery(ii_search_query, auth=auth)
214216
qu.request()
215217
query_data.append(qu.data)
218+
# Collect bot users from each query
219+
all_bot_users.update(qu.data.attrs.get("bot_users", set()))
216220

217221
query_data = (
218222
pd.concat(query_data).drop_duplicates(subset=["id"]).reset_index(drop=True)
@@ -223,9 +227,12 @@ def get_activity(
223227
query_data.until_dt_str = until_dt_str
224228
query_data.since_is_git_ref = since_is_git_ref
225229
query_data.until_is_git_ref = until_is_git_ref
230+
# Restore bot_users in attrs (lost during concat)
231+
query_data.attrs["bot_users"] = all_bot_users
226232

227233
if cache:
228234
_cache_data(query_data, cache)
235+
229236
return query_data
230237

231238

@@ -462,15 +469,34 @@ def generate_activity_md(
462469
data["contributors"] = [[]] * len(data)
463470

464471
# Get bot users from GraphQL data (stored in DataFrame attrs)
465-
bot_users = data.attrs.get("bot_users", set())
472+
bot_users = data.attrs["bot_users"]
466473

467474
def ignored_user(username):
468-
if username in bot_users:
475+
if not username:
476+
return False
477+
478+
# First check against GraphQL-detected bot users
479+
# It is common for a bot to have `username` in GitHub and `username[bot]` in commits.
480+
# So this accounts for that.
481+
normalized_username = username.replace("[bot]", "")
482+
if normalized_username in bot_users:
483+
return True
484+
485+
# Next use pattern-based fallback for bots not detected by GraphQL
486+
username_lower = username.lower()
487+
bot_patterns = [
488+
"[bot]", # e.g., github-actions[bot], codecov[bot]
489+
"-bot", # e.g., renovate-bot, release-bot (note: plain "dependabot" has no hyphen and is not matched by this pattern)
490+
]
491+
if any(pattern in username_lower for pattern in bot_patterns):
469492
return True
493+
494+
# Check against user-specified ignored contributors
470495
if ignored_contributors and any(
471496
fnmatch.fnmatch(username, user) for user in ignored_contributors
472497
):
473498
return True
499+
474500
return False
475501

476502
def filter_ignored(userlist):
@@ -490,12 +516,19 @@ def filter_ignored(userlist):
490516
# - merger
491517
# - reviewers
492518

493-
item_contributors.author = row.author
519+
# Only add author if they're not a bot
520+
if not ignored_user(row.author):
521+
item_contributors.author = row.author
494522

495523
if row.kind == "pr":
496524
for committer in filter_ignored(row.committers):
497525
item_contributors.add(committer)
498-
if row.mergedBy and row.mergedBy != row.author:
526+
# Only add merger if they're not a bot and not the author
527+
if (
528+
row.mergedBy
529+
and row.mergedBy != row.author
530+
and not ignored_user(row.mergedBy)
531+
):
499532
item_contributors.add(row.mergedBy)
500533
for reviewer in filter_ignored(row.reviewers):
501534
item_contributors.add(reviewer)

github_activity/graphql.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,15 @@
4747
committer {
4848
user {
4949
login
50+
__typename
5051
}
5152
}
5253
authors(first: 10) {
5354
edges {
5455
node {
5556
user {
5657
login
58+
__typename
5759
}
5860
}
5961
}
@@ -140,6 +142,7 @@ def __init__(self, query, display_progress=True, auth=None):
140142
variable `GITHUB_ACCESS_TOKEN` will be tried.
141143
"""
142144
self.query = query
145+
self.bot_users = set() # Store detected bot usernames
143146

144147
# Authentication
145148
token = auth or os.environ.get("GITHUB_ACCESS_TOKEN")
@@ -149,7 +152,7 @@ def __init__(self, query, display_progress=True, auth=None):
149152
"--auth flag or must be used to pass a Personal Access Token "
150153
"needed by the GitHub API. You can generate a token at "
151154
"https://github.com/settings/tokens/new. Note that while "
152-
"working with a public repository, you dont need to set any "
155+
"working with a public repository, you don't need to set any "
153156
"scopes on the token you create."
154157
)
155158
self.auth = TokenAuth(token)
@@ -240,9 +243,7 @@ def request(self, n_pages=100, n_per_page=50):
240243
# Extract bot users from raw data before DataFrame conversion
241244
def is_bot(user_dict):
242245
"""Check if a GraphQL user object represents a bot account."""
243-
if not user_dict:
244-
return False
245-
return user_dict.get("__typename") == "Bot"
246+
return user_dict and user_dict.get("__typename") == "Bot"
246247

247248
bot_users = set()
248249
for item in self.issues_and_or_prs:
@@ -272,9 +273,26 @@ def is_bot(user_dict):
272273
if is_bot(comment_author):
273274
bot_users.add(comment_author["login"])
274275

276+
# Check commit authors and committers
277+
commits = item.get("commits")
278+
if commits:
279+
for commit_edge in commits.get("edges", []):
280+
commit = commit_edge["node"]["commit"]
281+
# Check committer
282+
committer = commit.get("committer")
283+
if committer and committer.get("user"):
284+
if is_bot(committer["user"]):
285+
bot_users.add(committer["user"]["login"])
286+
# Check authors
287+
authors = commit.get("authors")
288+
if authors:
289+
for author_edge in authors.get("edges", []):
290+
author_user = author_edge["node"].get("user")
291+
if author_user and is_bot(author_user):
292+
bot_users.add(author_user["login"])
293+
275294
# Create a dataframe of the issues and/or PRs
276295
self.data = pd.DataFrame(self.issues_and_or_prs)
277-
# Store bot users in DataFrame metadata (attrs dict)
278296
self.data.attrs["bot_users"] = bot_users
279297

280298
# Add some extra fields

tests/test_cli.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,34 @@ def test_contributor_sorting(tmpdir, file_regression):
141141
run(cmd.split(), check=True)
142142
md = path_output.read_text()
143143
file_regression.check(md, extension=".md")
144+
145+
146+
@mark.integration
147+
def test_bot_filtering(file_regression):
148+
"""Test that bot users are detected and filtered from output."""
149+
from github_activity.github_activity import get_activity, generate_activity_md
150+
151+
# Use jupyter-book/mystmd because it's a small release, and we know there's bot activity
152+
data = get_activity(
153+
target="jupyter-book/mystmd",
154+
since="mystmd@1.6.5",
155+
until="mystmd@1.6.6",
156+
)
157+
158+
# Verify bot_users attrs exists and was preserved (catches the concat bug)
159+
assert "bot_users" in data.attrs, "bot_users should be in DataFrame attrs"
160+
161+
# Verify we actually detected some bots
162+
assert len(data.attrs["bot_users"]) > 0, (
163+
"Should have detected bot users in this release"
164+
)
165+
166+
# Generate markdown and save as regression baseline
167+
md = generate_activity_md(
168+
target="jupyter-book/mystmd",
169+
since="mystmd@1.6.5",
170+
until="mystmd@1.6.6",
171+
)
172+
173+
# Use this regression test to make sure no bots are in the output
174+
file_regression.check(md, extension=".md")
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# mystmd@1.6.5...mystmd@1.6.6
2+
3+
([full changelog](https://github.com/jupyter-book/mystmd/compare/mystmd@1.6.5...mystmd@1.6.6))
4+
5+
## Bugs fixed
6+
7+
- Fix execution bug: no need for kernelspec if no executable content [#2454](https://github.com/jupyter-book/mystmd/pull/2454) ([@choldgraf](https://github.com/choldgraf), [@stefanv](https://github.com/stefanv))
8+
9+
## Other merged PRs
10+
11+
- 🚀 Release [#2457](https://github.com/jupyter-book/mystmd/pull/2457) ([@stefanv](https://github.com/stefanv))
12+
- Pull in latest myst-execute [#2456](https://github.com/jupyter-book/mystmd/pull/2456) ([@stefanv](https://github.com/stefanv))
13+
- 🚀 Release [#2455](https://github.com/jupyter-book/mystmd/pull/2455) ([@stefanv](https://github.com/stefanv))
14+
- 🚀 Release [#2416](https://github.com/jupyter-book/mystmd/pull/2416) ([@bsipocz](https://github.com/bsipocz), [@choldgraf](https://github.com/choldgraf), [@stefanv](https://github.com/stefanv))
15+
16+
## Contributors to this release
17+
18+
The following people contributed discussions, new ideas, code and documentation contributions, and review.
19+
See [our definition of contributors](https://github-activity.readthedocs.io/en/latest/#how-does-this-tool-define-contributions-in-the-reports).
20+
21+
([GitHub contributors page for this release](https://github.com/jupyter-book/mystmd/graphs/contributors?from=2025-11-18&to=2025-11-19&type=c))
22+
23+
@bsipocz ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Absipocz+updated%3A2025-11-18..2025-11-19&type=Issues)) | @choldgraf ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Acholdgraf+updated%3A2025-11-18..2025-11-19&type=Issues)) | @jukent ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Ajukent+updated%3A2025-11-18..2025-11-19&type=Issues)) | @stefanv ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Astefanv+updated%3A2025-11-18..2025-11-19&type=Issues))

0 commit comments

Comments
 (0)