From c63e8d5d8a68dd5637fb2f9d01d162298270717a Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Wed, 5 Nov 2025 17:22:27 +0300
Subject: [PATCH 01/14] Added wikicommons

---
 scripts/1-fetch/wikicommons_fetch.py | 254 +++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 scripts/1-fetch/wikicommons_fetch.py

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
new file mode 100644
index 00000000..45a875f1
--- /dev/null
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+"""
+Fetch high-level WikiCommons statistics for Quantifying the Commons.
+Generates one dataset:
+1) Recursive category data (aggregated by LICENSE TYPE, File Count, Page Count)
+Uses the Wikimedia Commons API to retrieve metadata
+for Creative Commons license categories.
+"""
+
+# Standard library
+import argparse
+import csv
+import os
+import sys
+import textwrap
+import time
+import traceback
+
+# Third-party
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
+
+# Add parent directory for shared imports
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+# First-party/Local
+import shared  # noqa: E402
+
+# Setup
+LOGGER, PATHS = shared.setup(__file__)
+
+# Constants
+BASE_URL = "https://commons.wikimedia.org/w/api.php"
+FILE_WIKICOMMONS = shared.path_join(
+    PATHS["data_phase"], "data_wikicommons.csv"
+)
+HEADER_WIKICOMMONS = ["LICENSE TYPE", "File Count", "Page Count"]
+ROOT_CATEGORY = "Free_Creative_Commons_licenses"
+TIMEOUT = 25
+MAX_RETRIES = 5
+BACKOFF_FACTOR = 10
+
+
+def parse_arguments():
+    """Parse command-line options."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results to CSV.",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, push).",
+    )
+
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Limit recursive depth for testing (optional).",
+    )
+
+    args = parser.parse_args()
+    if not args.enable_save and args.enable_git:
+        parser.error("--enable-git requires --enable-save")
+    return args
+
+
+def get_content_request_url(category):
+    """Return API endpoint for WikiCommons category metadata."""
+    return (
+        f"{BASE_URL}?action=query"
+        f"&prop=categoryinfo"
+        f"&titles=Category:{category}"
+        f"&format=json"
+    )
+
+
+def get_subcategories(category, session):
+    """Fetch all subcategories for a
+    given category, handling pagination."""
+    all_subcats = []
+    cmcontinue = None
+
+    while True:
+        try:
+            params = {
+                "action": "query",
+                "list": "categorymembers",
+                "cmtitle": f"Category:{category}",
+                "cmtype": "subcat",
+                "format": "json",
+                "cmlimit": "max",
+            }
+            if cmcontinue:
+                params["cmcontinue"] = cmcontinue
+
+            resp = session.get(BASE_URL, params=params, timeout=TIMEOUT)
+            resp.raise_for_status()
+            data = resp.json()
+
+            members = data.get("query", {}).get("categorymembers", [])
+            subcats = [
+                m["title"].replace("Category:", "").replace("&", "%26")
+                for m in members
+            ]
+            all_subcats.extend(subcats)
+
+            # Handle pagination
+            if "continue" in data and "cmcontinue" in data["continue"]:
+                cmcontinue = data["continue"]["cmcontinue"]
+                time.sleep(0.2)
+            else:
+                break
+
+        except Exception as e:
+            LOGGER.warning(
+                f"Failed to fetch subcategories for {category}: {e}"
+            )
+            break
+
+    return all_subcats
+
+
+def get_license_contents(category, session):
+    """Fetch total file and page counts for a category."""
+    try:
+        url = get_content_request_url(category)
+        resp = session.get(url, timeout=TIMEOUT)
+        resp.raise_for_status()
+        data = resp.json()
+        file_cnt, page_cnt = 0, 0
+        for _, info in data.get("query", {}).get("pages", {}).items():
+            catinfo = info.get("categoryinfo", {})
+            file_cnt += catinfo.get("files", 0)
+            page_cnt += catinfo.get("pages", 0)
+        return {"File Count": file_cnt, "Page Count": page_cnt}
+    except Exception as e:
+        LOGGER.warning(f"Failed to fetch contents for {category}: {e}")
+        return {"File Count": 0, "Page Count": 0}
+
+
+def recursive_collect_data(session, root_category, limit=None):
+    """Recursively traverse WikiCommons categories and collect data."""
+
+    results = []
+    visited = set()
+
+    def traverse(category, path, depth=0):
+        if limit and depth >= limit:
+            return
+        if category in visited:
+            return
+        visited.add(category)
+
+        # Get counts for the current category itself
+        contents = get_license_contents(category, session)
+
+        results.append(
+            {
+                "LICENSE TYPE": path,
+                "File Count": contents["File Count"],
+                "Page Count": contents["Page Count"],
+            }
+        )
+
+        # Get subcategories
+        subcats = get_subcategories(category, session)
+        count = len(subcats)
+
+        # Logging label
+        label = "categories" if depth == 0 else "subcategories"
+        if count == 0:
+            LOGGER.warning(f"Skipping {category} — 0 {label} found.")
+        else:
+            LOGGER.info(f"Fetched {count} {label} for {category}.")
+
+        # Recursively traverse subcategories
+        for sub in subcats:
+            traverse(sub, f"{path}/{sub}", depth + 1)
+            time.sleep(0.05)  # brief pause between requests
+
+    # Start traversal from root
+    traverse(root_category, root_category)
+    return results
+
+
+def write_data(args, wikicommons_data):
+    """Write WikiCommons data to CSV."""
+    if not args.enable_save:
+        return args
+
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    with open(FILE_WIKICOMMONS, "w", newline="") as f:
+        writer = csv.DictWriter(
+            f, fieldnames=HEADER_WIKICOMMONS, dialect="unix"
+        )
+        writer.writeheader()
+        writer.writerows(wikicommons_data)
+
+    LOGGER.info(f"Saved {len(wikicommons_data)} rows to {FILE_WIKICOMMONS}.")
+    return args
+
+
+def main():
+    args = parse_arguments()
+    LOGGER.info("Starting WikiCommons data fetch.")
+    shared.paths_log(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+
+    session = shared.get_session(accept_header="application/json")
+    wikicommons_data = recursive_collect_data(
+        session, ROOT_CATEGORY, limit=args.limit
+    )
+    args = write_data(args, wikicommons_data)
+
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        "Add WikiCommons dataset for Quantifying the Commons.",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
+
+    LOGGER.info("WikiCommons fetch completed successfully.")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except shared.QuantifyingException as e:
+        LOGGER.error(e.message)
+        sys.exit(e.exit_code)
+    except SystemExit as e:
+        if e.code != 0:
+            LOGGER.error(f"System exit with code: {e.code}")
+            sys.exit(e.code)
+    except KeyboardInterrupt:
+        LOGGER.info("(130) Halted via KeyboardInterrupt.")
+        sys.exit(130)
+    except Exception:
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
+        sys.exit(1)
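Before the follow-up patches, it is worth spelling out the API contract this first version leans on: `list=categorymembers` returns one batch per request and hands back a `cmcontinue` token that must be echoed into the next request, which is what `get_subcategories` does. Below is a standalone sketch of that pagination loop, using plain `requests` rather than the repo's `shared.get_session` helper; the endpoint and parameters mirror the script, everything else is illustrative.

```python
# Minimal sketch of categorymembers pagination against the Commons API.
import requests

BASE_URL = "https://commons.wikimedia.org/w/api.php"


def list_subcategories(category):
    """Yield subcategory names of `category`, following cmcontinue."""
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmtype": "subcat",
        "cmlimit": "max",
        "format": "json",
    }
    while True:
        data = requests.get(BASE_URL, params=params, timeout=25).json()
        for member in data.get("query", {}).get("categorymembers", []):
            yield member["title"].removeprefix("Category:")
        if "continue" not in data:
            break
        # Echo the whole continuation object back into the next request.
        params.update(data["continue"])


for name in list_subcategories("Free_Creative_Commons_licenses"):
    print(name)
```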
From 8d5366ab781e587352c563bc178ab35c81e12f98 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Wed, 5 Nov 2025 19:36:04 +0300
Subject: [PATCH 02/14] Renamed csv

---
 scripts/1-fetch/wikicommons_fetch.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index 45a875f1..40318793 100644
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -31,9 +31,7 @@
 
 # Constants
 BASE_URL = "https://commons.wikimedia.org/w/api.php"
-FILE_WIKICOMMONS = shared.path_join(
-    PATHS["data_phase"], "data_wikicommons.csv"
-)
+FILE_WIKICOMMONS = shared.path_join(PATHS["data_phase"], "wikicommons.csv")
 HEADER_WIKICOMMONS = ["LICENSE TYPE", "File Count", "Page Count"]
 ROOT_CATEGORY = "Free_Creative_Commons_licenses"
 TIMEOUT = 25

From 0ee217a87cfe94cc2e053ca9f1ce926601e21242 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Thu, 6 Nov 2025 11:30:56 +0300
Subject: [PATCH 03/14] Made it executable

---
 scripts/1-fetch/wikicommons_fetch.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)
 mode change 100644 => 100755 scripts/1-fetch/wikicommons_fetch.py

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
old mode 100644
new mode 100755
index 40318793..847e9319
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -35,8 +35,6 @@
 HEADER_WIKICOMMONS = ["LICENSE TYPE", "File Count", "Page Count"]
 ROOT_CATEGORY = "Free_Creative_Commons_licenses"
 TIMEOUT = 25
-MAX_RETRIES = 5
-BACKOFF_FACTOR = 10
 
 
 def parse_arguments():
@@ -57,7 +55,7 @@ def parse_arguments():
         "--limit",
         type=int,
         default=None,
-        help="Limit recursive depth for testing (optional).",
+        help="Limit recursive depth for testing.",
     )
 
     args = parser.parse_args()
@@ -122,7 +120,7 @@ def get_subcategories(category, session):
     return all_subcats
 
 
-def get_license_contents(category, session):
+def fetch_category_totals(category, session):
     """Fetch total file and page counts for a category."""
     try:
         url = get_content_request_url(category)
@@ -140,7 +138,7 @@ def fetch_category_totals(category, session):
         return {"File Count": 0, "Page Count": 0}
 
 
-def recursive_collect_data(session, root_category, limit=None):
+def recursive_collect_data(session, limit=None):
     """Recursively traverse WikiCommons categories and collect data."""
 
     results = []
@@ -154,7 +152,7 @@ def traverse(category, path, depth=0):
         visited.add(category)
 
         # Get counts for the current category itself
-        contents = get_license_contents(category, session)
+        contents = fetch_category_totals(category, session)
 
         results.append(
             {
@@ -181,7 +179,7 @@ def traverse(category, path, depth=0):
             time.sleep(0.05)  # brief pause between requests
 
     # Start traversal from root
-    traverse(root_category, root_category)
+    traverse(ROOT_CATEGORY, ROOT_CATEGORY)
     return results
 
 
@@ -207,12 +205,8 @@ def main():
     LOGGER.info("Starting WikiCommons data fetch.")
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-
     session = shared.get_session(accept_header="application/json")
-    wikicommons_data = recursive_collect_data(
-        session, ROOT_CATEGORY, limit=args.limit
-    )
+    wikicommons_data = recursive_collect_data(session, limit=args.limit)
     args = write_data(args, wikicommons_data)
 
     args = shared.git_add_and_commit(

From 7521245bf66c35c4f7440b9e55092d879228284f Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Thu, 6 Nov 2025 12:08:32 +0300
Subject: [PATCH 04/14] Made necessary changes

---
 scripts/1-fetch/wikicommons_fetch.py | 34 ++++++++++++++++------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index 847e9319..38d995f3 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -32,7 +32,7 @@
 # Constants
 BASE_URL = "https://commons.wikimedia.org/w/api.php"
 FILE_WIKICOMMONS = shared.path_join(PATHS["data_phase"], "wikicommons.csv")
-HEADER_WIKICOMMONS = ["LICENSE TYPE", "File Count", "Page Count"]
+HEADER_WIKICOMMONS = ["LICENSE_TYPE", "File_Count", "Page_Count"]
 ROOT_CATEGORY = "Free_Creative_Commons_licenses"
 TIMEOUT = 25
 
@@ -55,7 +55,7 @@ def parse_arguments():
         "--limit",
         type=int,
         default=None,
-        help="Limit recursive depth for testing.",
+        help="Limit recursive depth for testing",
     )
 
     args = parser.parse_args()
@@ -64,16 +64,6 @@ def parse_arguments():
     return args
 
 
-def get_content_request_url(category):
-    """Return API endpoint for WikiCommons category metadata."""
-    return (
-        f"{BASE_URL}?action=query"
-        f"&prop=categoryinfo"
-        f"&titles=Category:{category}"
-        f"&format=json"
-    )
-
-
 def get_subcategories(category, session):
     """Fetch all subcategories for a
     given category, handling pagination."""
@@ -123,8 +113,13 @@ def get_subcategories(category, session):
 def fetch_category_totals(category, session):
     """Fetch total file and page counts for a category."""
     try:
-        url = get_content_request_url(category)
-        resp = session.get(url, timeout=TIMEOUT)
+        params = {
+            "action": "query",
+            "prop": "categoryinfo",
+            "titles": f"Category:{category}",
+            "format": "json",
+        }
+        resp = session.get(BASE_URL, params=params, timeout=TIMEOUT)
         resp.raise_for_status()
         data = resp.json()
         file_cnt, page_cnt = 0, 0
@@ -135,7 +130,7 @@ def fetch_category_totals(category, session):
         return {"File Count": file_cnt, "Page Count": page_cnt}
     except Exception as e:
         LOGGER.warning(f"Failed to fetch contents for {category}: {e}")
-        return {"File Count": 0, "Page Count": 0}
+        return {"File Count": None, "Page Count": None}
 
 
 def recursive_collect_data(session, limit=None):
@@ -151,9 +146,9 @@ def traverse(category, path, depth=0):
 
         results.append(
             {
-                "LICENSE TYPE": path,
-                "File Count": contents["File Count"],
-                "Page Count": contents["Page Count"],
+                "LICENSE_TYPE": path,
+                "File_Count": contents["File Count"],
+                "Page_Count": contents["Page Count"],
             }
         )
 
@@ -189,7 +184,8 @@ def write_data(args, wikicommons_data):
         return args
 
     os.makedirs(PATHS["data_phase"], exist_ok=True)
-    with open(FILE_WIKICOMMONS, "w", newline="") as f:
+    with open(FILE_WIKICOMMONS, "w", encoding="utf-8", newline="\n") as f:
+
         writer = csv.DictWriter(
             f, fieldnames=HEADER_WIKICOMMONS, dialect="unix"
         )
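The switch above from a hand-assembled query string to a `params` dict is more than style: `requests` percent-encodes each value when building the URL, which is what patch 01's manual `.replace("&", "%26")` was approximating by hand. A small sketch of the encoding behavior (the category name here is made up):

```python
# Demonstrates that requests URL-encodes reserved characters in params.
import requests

prepared = requests.Request(
    "GET",
    "https://commons.wikimedia.org/w/api.php",
    params={
        "action": "query",
        "prop": "categoryinfo",
        "titles": "Category:B&W photographs",  # hypothetical name with "&"
        "format": "json",
    },
).prepare()
print(prepared.url)
# The "&" inside the title is sent as %26, so it cannot be mistaken
# for a query-parameter separator.
```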
From 756243ee6fa69df1b591c52dd786a72366786604 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Thu, 6 Nov 2025 12:14:18 +0300
Subject: [PATCH 05/14] Followed header naming conventions

---
 scripts/1-fetch/wikicommons_fetch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index 38d995f3..d2dde66e 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -127,10 +127,10 @@ def fetch_category_totals(category, session):
             catinfo = info.get("categoryinfo", {})
             file_cnt += catinfo.get("files", 0)
             page_cnt += catinfo.get("pages", 0)
-        return {"File Count": file_cnt, "Page Count": page_cnt}
+        return {"File_Count": file_cnt, "Page_Count": page_cnt}
     except Exception as e:
         LOGGER.warning(f"Failed to fetch contents for {category}: {e}")
-        return {"File Count": None, "Page Count": None}
+        return {"File_Count": None, "Page_Count": None}
 
 
 def recursive_collect_data(session, limit=None):
@@ -152,8 +152,8 @@ def traverse(category, path, depth=0):
         results.append(
             {
                 "LICENSE_TYPE": path,
-                "File_Count": contents["File Count"],
-                "Page_Count": contents["Page Count"],
+                "File_Count": contents["File_Count"],
+                "Page_Count": contents["Page_Count"],
             }
         )

From 01c93f30a7ae8c3f6154d6878f9ba581ab5f64b4 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Thu, 6 Nov 2025 12:17:19 +0300
Subject: [PATCH 06/14] Followed header naming conventions (in caps)

---
 scripts/1-fetch/wikicommons_fetch.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index d2dde66e..42bd2a5a 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -32,7 +32,7 @@
 # Constants
 BASE_URL = "https://commons.wikimedia.org/w/api.php"
 FILE_WIKICOMMONS = shared.path_join(PATHS["data_phase"], "wikicommons.csv")
-HEADER_WIKICOMMONS = ["LICENSE_TYPE", "File_Count", "Page_Count"]
+HEADER_WIKICOMMONS = ["LICENSE_TYPE", "FILE_COUNT", "PAGE_COUNT"]
 ROOT_CATEGORY = "Free_Creative_Commons_licenses"
 TIMEOUT = 25
 
@@ -127,10 +127,10 @@ def fetch_category_totals(category, session):
             catinfo = info.get("categoryinfo", {})
             file_cnt += catinfo.get("files", 0)
             page_cnt += catinfo.get("pages", 0)
-        return {"File_Count": file_cnt, "Page_Count": page_cnt}
+        return {"FILE_COUNT": file_cnt, "PAGE_COUNT": page_cnt}
     except Exception as e:
         LOGGER.warning(f"Failed to fetch contents for {category}: {e}")
-        return {"File_Count": None, "Page_Count": None}
+        return {"FILE_COUNT": None, "PAGE_COUNT": None}
 
 
 def recursive_collect_data(session, limit=None):
@@ -152,8 +152,8 @@ def traverse(category, path, depth=0):
         results.append(
             {
                 "LICENSE_TYPE": path,
-                "File_Count": contents["File_Count"],
-                "Page_Count": contents["Page_Count"],
+                "FILE_COUNT": contents["FILE_COUNT"],
+                "PAGE_COUNT": contents["FILE_COUNT"],
             }
        )

From c055c47636336245fba71dea11c63f8325596c97 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Thu, 6 Nov 2025 12:45:59 +0300
Subject: [PATCH 07/14] Changed file name

---
 scripts/1-fetch/wikicommons_fetch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index 42bd2a5a..3c35d6f2 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -31,7 +31,9 @@
 
 # Constants
 BASE_URL = "https://commons.wikimedia.org/w/api.php"
-FILE_WIKICOMMONS = shared.path_join(PATHS["data_phase"], "wikicommons.csv")
+FILE_WIKICOMMONS = shared.path_join(
+    PATHS["data_phase"], "wikicommons_fetch.csv"
+)
 HEADER_WIKICOMMONS = ["LICENSE_TYPE", "FILE_COUNT", "PAGE_COUNT"]
 ROOT_CATEGORY = "Free_Creative_Commons_licenses"
 TIMEOUT = 25
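With the header names and file name settled at this point in the series (`LICENSE_TYPE`, `FILE_COUNT`, `PAGE_COUNT` in `wikicommons_fetch.csv`), a later phase can consume the dataset with `csv.DictReader`. A hedged sketch of such a consumer; the aggregation shown is illustrative, not part of this series:

```python
# Reads the CSV written by write_data() and sums the file counts.
import csv

with open("wikicommons_fetch.csv", encoding="utf-8", newline="") as file_obj:
    total_files = 0
    for row in csv.DictReader(file_obj):
        # A failed fetch writes None, which lands in the CSV as an
        # empty string, hence the "or 0" fallback.
        total_files += int(row["FILE_COUNT"] or 0)

print(f"Total files across fetched categories: {total_files}")
```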
From 5c48e52499278a19dddf8a17a59d91765ad214a8 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Thu, 6 Nov 2025 13:04:12 +0300
Subject: [PATCH 08/14] raise QuantifyingException for failed category fetch
 instead of returning None

---
 scripts/1-fetch/wikicommons_fetch.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index 3c35d6f2..fb8c2e37 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -131,8 +131,9 @@ def fetch_category_totals(category, session):
             page_cnt += catinfo.get("pages", 0)
         return {"FILE_COUNT": file_cnt, "PAGE_COUNT": page_cnt}
     except Exception as e:
-        LOGGER.warning(f"Failed to fetch contents for {category}: {e}")
-        return {"FILE_COUNT": None, "PAGE_COUNT": None}
+        message = f"Failed to fetch contents for {category}: {e}"
+        LOGGER.error(message)
+        raise shared.QuantifyingException(message, exit_code=1)
 
 
 def recursive_collect_data(session, limit=None):

From 1a0a8b5c380171e045bfebe72e15ffb8f10ed7dc Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Thu, 6 Nov 2025 13:09:04 +0300
Subject: [PATCH 09/14] raise QuantifyingException for failed category fetch
 instead of returning None

---
 scripts/1-fetch/wikicommons_fetch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index fb8c2e37..4b648e18 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -133,7 +133,7 @@ def fetch_category_totals(category, session):
     except Exception as e:
         message = f"Failed to fetch contents for {category}: {e}"
         LOGGER.error(message)
-        raise shared.QuantifyingException(message, exit_code=1)
+        raise shared.QuantifyingException(message)
 
 
 def recursive_collect_data(session, limit=None):
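The `shared` module itself is not part of this series, so the exact shape of `QuantifyingException` is not visible here. For the handler in `main()` (which reads `e.message` and `e.exit_code`) and the bare `raise shared.QuantifyingException(message)` above to both work, it would need roughly the following shape. This is a guess consistent with the call sites, not the actual implementation:

```python
# Hypothetical stand-in for shared.QuantifyingException: an exit_code
# default is also what would let patch 09 drop the explicit
# exit_code=1 argument without changing behavior.
class QuantifyingException(Exception):
    def __init__(self, message, exit_code=1):
        super().__init__(message)
        self.message = message
        self.exit_code = exit_code
```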
From 906beeaa15f5832a01e323bed9278828b76ccb5c Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Fri, 14 Nov 2025 16:56:09 +0300
Subject: [PATCH 10/14] Added sources

---
 sources.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/sources.md b/sources.md
index 2f559bef..0f64bceb 100644
--- a/sources.md
+++ b/sources.md
@@ -168,3 +168,19 @@ language edition of wikipedia. It runs on the Meta-Wiki API.
 - No API key required
 - Query limit: It is rate-limited only to prevent abuse
 - Data available through XML or JSON format
+
+## Wikimedia
+
+**Description:** Wikimedia Commons is a repository of free-to-use media files. Its API allows users to query files, categories, metadata, and license information. You can retrieve statistics such as file counts, page counts, categories, and subcategories. The API runs on the MediaWiki Action API, similar to Wikipedia, and provides access to information about media files, licenses, and categories across Wikimedia projects.
+
+**API documentation link:**
+[WIKIMEDIA_BASE_URL documentation](https://en.wikipedia.org/w/api.php)
+[WIKIMEDIA_BASE_URL reference page](https://www.mediawiki.org/wiki/API:Action_API)
+
+
+**API information**
+
+- No API key required
+- Query limit: Rate-limited to prevent abuse
+- Data available in XML or JSON format
+- Can query file metadata, category members, and license types

From 5c4ea28459e77543318d0fd5b8a0812aa0e3c25e Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Fri, 14 Nov 2025 16:59:28 +0300
Subject: [PATCH 11/14] Added sources

---
 sources.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sources.md b/sources.md
index 0f64bceb..a251d654 100644
--- a/sources.md
+++ b/sources.md
@@ -169,7 +169,7 @@ language edition of wikipedia. It runs on the Meta-Wiki API.
 - Query limit: It is rate-limited only to prevent abuse
 - Data available through XML or JSON format
 
-## Wikimedia
+## Wikimedia Commons
 
 **Description:** Wikimedia Commons is a repository of free-to-use media files. Its API allows users to query files, categories, metadata, and license information. You can retrieve statistics such as file counts, page counts, categories, and subcategories. The API runs on the MediaWiki Action API, similar to Wikipedia, and provides access to information about media files, licenses, and categories across Wikimedia projects.
 

From 35498c04017a7c8865642f587c8f47e34dcaf969 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Mon, 17 Nov 2025 12:14:47 +0300
Subject: [PATCH 12/14] Made necessary changes

---
 scripts/1-fetch/wikicommons_fetch.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index 4b648e18..20a8a16f 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -32,7 +32,7 @@
 # Constants
 BASE_URL = "https://commons.wikimedia.org/w/api.php"
 FILE_WIKICOMMONS = shared.path_join(
-    PATHS["data_phase"], "wikicommons_fetch.csv"
+    PATHS["data_phase"], "wikicommons_legal_tool_counts.csv"
 )
 HEADER_WIKICOMMONS = ["LICENSE_TYPE", "FILE_COUNT", "PAGE_COUNT"]
 ROOT_CATEGORY = "Free_Creative_Commons_licenses"
 TIMEOUT = 25
@@ -156,7 +156,7 @@ def traverse(category, path, depth=0):
             {
                 "LICENSE_TYPE": path,
                 "FILE_COUNT": contents["FILE_COUNT"],
-                "PAGE_COUNT": contents["FILE_COUNT"],
+                "PAGE_COUNT": contents["PAGE_COUNT"],
             }
         )
 
@@ -166,10 +166,7 @@ def traverse(category, path, depth=0):
 
         # Logging label
         label = "categories" if depth == 0 else "subcategories"
-        if count == 0:
-            LOGGER.warning(f"Skipping {category} — 0 {label} found.")
-        else:
-            LOGGER.info(f"Fetched {count} {label} for {category}.")
+        LOGGER.info(f"Fetched {count} {label} for {category}.")
 
         # Recursively traverse subcategories
         for sub in subcats:
@@ -187,10 +184,12 @@ def write_data(args, wikicommons_data):
         return args
 
     os.makedirs(PATHS["data_phase"], exist_ok=True)
-    with open(FILE_WIKICOMMONS, "w", encoding="utf-8", newline="\n") as f:
+    with open(
+        FILE_WIKICOMMONS, "w", encoding="utf-8", newline="\n"
+    ) as file_obj:
 
         writer = csv.DictWriter(
-            f, fieldnames=HEADER_WIKICOMMONS, dialect="unix"
+            file_obj, fieldnames=HEADER_WIKICOMMONS, dialect="unix"
         )
         writer.writeheader()
         writer.writerows(wikicommons_data)
From d928671dcb48fbfa132d250268edc33b2c1e Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Mon, 24 Nov 2025 16:42:40 +0300
Subject: [PATCH 13/14] Included only legal tools

---
 scripts/1-fetch/wikicommons_fetch.py | 82 +++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 14 deletions(-)

diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py
index 20a8a16f..91336c0c 100755
--- a/scripts/1-fetch/wikicommons_fetch.py
+++ b/scripts/1-fetch/wikicommons_fetch.py
@@ -136,6 +136,48 @@ def fetch_category_totals(category, session):
         raise shared.QuantifyingException(message)
 
 
+# Helper function to check if a category
+# name represents a valid CC license tool
+def is_valid_license_tool(category_name):
+    """
+    Checks if a category name corresponds to
+    an official Creative Commons license tool.
+    Official license categories usually start with
+    'CC-' followed by a combination
+    of BY, SA, ND, NC, and a version number (e.g., CC-BY-4.0)
+
+    EXCLUDED CC Licenses (marked 'Not OK' in policy):
+    - Attribution-NonCommercial (CC BY-NC).
+    - Attribution-NoDerivs (CC BY-ND).
+    - Any combination containing NC or ND restrictions.
+    """
+    # A list of common patterns to check
+    if category_name.startswith("CC-") and any(
+        x in category_name for x in ["BY", "SA"]
+    ):
+        # Specific exceptions that look like
+        # licenses but are markers/subcategories
+        if "migrated" in category_name or "Retired" in category_name:
+            return False
+        return True
+
+    # Check for CC0 Public Domain Dedication (often just "CC0")
+    if (
+        category_name == "CC0"
+        or category_name.startswith("CC0-")
+        or category_name == "CC-Zero"
+    ):
+        return True
+
+    # The root category itself is not a license tool
+    if category_name == ROOT_CATEGORY:
+        return False
+
+    return False
+
+
 def recursive_collect_data(session, limit=None):
     """Recursively traverse WikiCommons categories and collect data."""
 
@@ -146,27 +188,39 @@ def traverse(category, path, depth=0):
             return
         visited.add(category)
 
-        # Get counts for the current category itself
-        contents = fetch_category_totals(category, session)
-
-        results.append(
-            {
-                "LICENSE_TYPE": path,
-                "FILE_COUNT": contents["FILE_COUNT"],
-                "PAGE_COUNT": contents["PAGE_COUNT"],
-            }
-        )
-
-        # Get subcategories
+        # Only fetch and collect data for valid license tools
+        if is_valid_license_tool(category):
+            try:
+                # Get counts for the current category
+                contents = fetch_category_totals(category, session)
+
+                results.append(
+                    {
+                        # Use the specific license category name
+                        # as the LICENSE_TYPE
+                        "LICENSE_TYPE": category,
+                        "FILE_COUNT": contents["FILE_COUNT"],
+                        "PAGE_COUNT": contents["PAGE_COUNT"],
+                    }
+                )
+            except shared.QuantifyingException as e:
+                # Log the specific license category failure
+                LOGGER.error(
+                    f"Failed to process valid license category {category}: {e}"
+                )
+
+        # Get subcategories (check subcategories,
+        # as a valid license might be nested under a non-license category)
         subcats = get_subcategories(category, session)
-        count = len(subcats)
 
         # Logging label
         label = "categories" if depth == 0 else "subcategories"
-        LOGGER.info(f"Fetched {count} {label} for {category}.")
+        LOGGER.info(f"Fetched {len(subcats)} {label} for {category}.")
 
         # Recursively traverse subcategories
         for sub in subcats:
+            # Use the subcategory name as the 'path' for traversal,
+            # but use the category name for the final result.
             traverse(sub, f"{path}/{sub}", depth + 1)
             time.sleep(0.05)  # brief pause between requests
 
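A quick way to see what the new filter accepts and rejects is to run it over a handful of category names; the samples below are illustrative rather than drawn from the live category tree. Note that, as written, the pattern check still admits NC and ND combinations (they start with `CC-` and contain `BY`), even though the docstring lists them as excluded:

```python
samples = [
    "CC-BY-4.0",            # True: BY pattern
    "CC-BY-SA-3.0",         # True: BY/SA pattern
    "CC-BY-NC-4.0",         # True as written, despite the docstring's
                            # stated NC exclusion
    "CC-BY-4.0-migrated",   # False: migration marker, not a tool
    "CC0",                  # True: public domain dedication
    "CC-PD-Mark",           # False: no BY/SA pattern and not CC0
    "Free_Creative_Commons_licenses",  # False: the root category
]
for name in samples:
    print(f"{name}: {is_valid_license_tool(name)}")
```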
From 641855e0225dd7b2d4d22db55cbb79797eae2bc2 Mon Sep 17 00:00:00 2001
From: Joy Akinyi
Date: Mon, 24 Nov 2025 16:44:20 +0300
Subject: [PATCH 14/14] Rearranged order of sources

---
 sources.md | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/sources.md b/sources.md
index a251d654..10f47b1d 100644
--- a/sources.md
+++ b/sources.md
@@ -146,6 +146,21 @@ license_version breakdown.
 - Media types: `images`, `audio`
 - Supported licenses: `by`, `by-nc`, `by-nc-nd`, `by-nc-sa`, `by-nd`, `by-sa`, `cc0`, `nc-sampling+`, `pdm`, `sampling+`
+## Wikimedia Commons
+
+**Description:** Wikimedia Commons is a repository of free-to-use media files. Its API allows users to query files, categories, metadata, and license information. You can retrieve statistics such as file counts, page counts, categories, and subcategories. The API runs on the MediaWiki Action API, similar to Wikipedia, and provides access to information about media files, licenses, and categories across Wikimedia projects.
+
+**API documentation link:**
+[WIKIMEDIA_BASE_URL documentation](https://en.wikipedia.org/w/api.php)
+[WIKIMEDIA_BASE_URL reference page](https://www.mediawiki.org/wiki/API:Action_API)
+
+
+**API information**
+
+- No API key required
+- Query limit: Rate-limited to prevent abuse
+- Data available in XML or JSON format
+- Can query file metadata, category members, and license types
 
 ## Wikipedia
 
@@ -168,19 +183,3 @@ language edition of wikipedia. It runs on the Meta-Wiki API.
 - No API key required
 - Query limit: It is rate-limited only to prevent abuse
 - Data available through XML or JSON format
-
-## Wikimedia Commons
-
-**Description:** Wikimedia Commons is a repository of free-to-use media files. Its API allows users to query files, categories, metadata, and license information. You can retrieve statistics such as file counts, page counts, categories, and subcategories. The API runs on the MediaWiki Action API, similar to Wikipedia, and provides access to information about media files, licenses, and categories across Wikimedia projects.
-
-**API documentation link:**
-[WIKIMEDIA_BASE_URL documentation](https://en.wikipedia.org/w/api.php)
-[WIKIMEDIA_BASE_URL reference page](https://www.mediawiki.org/wiki/API:Action_API)
-
-
-**API information**
-
-- No API key required
-- Query limit: Rate-limited to prevent abuse
-- Data available in XML or JSON format
-- Can query file metadata, category members, and license types
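To ground the last bullet of the relocated sources entry ("file metadata, category members, and license types"), here is a hedged sketch of fetching one file's license through the same Action API, via `prop=imageinfo` with `iiprop=extmetadata`; the file title used is simply Commons' standard example file.

```python
# Fetch the license short name of a single Commons file.
import requests

resp = requests.get(
    "https://commons.wikimedia.org/w/api.php",
    params={
        "action": "query",
        "prop": "imageinfo",
        "iiprop": "extmetadata",
        "titles": "File:Example.jpg",
        "format": "json",
    },
    timeout=25,
)
for page in resp.json()["query"]["pages"].values():
    extmetadata = page["imageinfo"][0]["extmetadata"]
    # LicenseShortName is one of the documented extmetadata fields.
    print(extmetadata["LicenseShortName"]["value"])
```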