Merge pull request #61 from TogetherCrew/feat/60-limit-mediawiki-load-batch-size

amindadgar · web-flow · commit 3745b997352c · 2025-06-29T16:45:12.000+03:30
feat: limit batch size to 1!
diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py
@@ -103,7 +103,9 @@ def load(self, documents: list[Document]) -> None:
         )
         
         # Process batches in parallel using ThreadPoolExecutor
-        batch_size = 1000
+        # TODO: Revert to larger batch size once llama-index loading issue is resolved
+        # See: https://github.com/TogetherCrew/temporal-worker-python/issues/60
+        batch_size = 1
         batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]
         
         with ThreadPoolExecutor(max_workers=10) as executor:

Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,9 @@ def load(self, documents: list[Document]) -> None:`
`103`	`103`	`)`
`104`	`104`
`105`	`105`	`# Process batches in parallel using ThreadPoolExecutor`
`106`		`- batch_size = 1000`
	`106`	`+ # TODO: Revert to larger batch size once llama-index loading issue is resolved`
	`107`	`+ # See: https://github.com/TogetherCrew/temporal-worker-python/issues/60`
	`108`	`+ batch_size = 1`
`107`	`109`	`batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]`
`108`	`110`
`109`	`111`	`with ThreadPoolExecutor(max_workers=10) as executor:`