Merged
Changes from 2 commits
2 changes: 1 addition & 1 deletion hivemind_etl/mediawiki/etl.py
@@ -99,7 +99,7 @@ def transform(self) -> list[Document]:
     def load(self, documents: list[Document]) -> None:
         logging.info(f"Loading {len(documents)} documents into Qdrant!")
         ingestion_pipeline = CustomIngestionPipeline(
-            self.community_id, collection_name=self.platform_id
+            self.community_id, collection_name=self.platform_id, use_cache=False,
         )
 
         # Process batches in parallel using ThreadPoolExecutor
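The trailing context above shows that load() processes batches in parallel with a ThreadPoolExecutor. A minimal sketch of that pattern, assuming a hypothetical run_pipeline(documents) method and invented batch_size/max_workers parameters; none of these names appear in the diff:

    from concurrent.futures import ThreadPoolExecutor

    def load_in_batches(pipeline, documents, batch_size=100, max_workers=4):
        """Hypothetical helper mirroring the 'process batches in parallel' comment."""
        # Split the document list into fixed-size batches.
        batches = [
            documents[i : i + batch_size]
            for i in range(0, len(documents), batch_size)
        ]
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # run_pipeline is assumed; substitute whatever the real pipeline exposes.
            futures = [executor.submit(pipeline.run_pipeline, batch) for batch in batches]
            for future in futures:
                future.result()  # propagate any exception raised in a worker thread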
2 changes: 2 additions & 0 deletions hivemind_etl/simple_ingestion/pipeline.py
@@ -147,6 +147,7 @@ async def process_document(
     pipeline = CustomIngestionPipeline(
         community_id=ingestion_request.communityId,
         collection_name=collection_name,
+        use_cache=False,
     )
 
     document = Document(
@@ -188,6 +189,7 @@ async def process_documents_batch(
     pipeline = CustomIngestionPipeline(
         community_id=batch_chunk.communityId,
         collection_name=collection_name,
+        use_cache=False,
     )
 
     # Convert all documents in this chunk to Document objects
2 changes: 1 addition & 1 deletion hivemind_etl/website/website_etl.py
@@ -30,7 +30,7 @@ def __init__(
 
         # preparing the ingestion pipeline
         self.ingestion_pipeline = CustomIngestionPipeline(
-            self.community_id, collection_name=self.platform_id
+            self.community_id, collection_name=self.platform_id, use_cache=False,
         )
 
     async def extract(
2 changes: 2 additions & 0 deletions hivemind_summarizer/activities.py
@@ -97,6 +97,7 @@ async def fetch_platform_summaries_by_date(
     pipeline = CustomIngestionPipeline(
         community_id=community_id,
         collection_name=f"{input.platform_id}_summary",
+        use_cache=False,
     )
     # get the latest date from the collection
     latest_date = pipeline.get_latest_document_date(
@@ -211,6 +212,7 @@ async def fetch_platform_summaries_by_date_range(
             extract_text_only=extract_text_only,
             platform_id=input.platform_id,
             community_id=community_id,
+            use_cache=False,
         )
         summaries = await fetch_platform_summaries_by_date(date_input)
         result[date] = summaries
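Across all four files the change is identical: every CustomIngestionPipeline call site now passes use_cache=False. A minimal sketch of the constructor surface these hunks exercise; only community_id, collection_name, and use_cache are confirmed by the diff, while the use_cache=True default and the cache wiring below are illustrative assumptions, not the real implementation:

    from typing import Any, Optional

    class CustomIngestionPipeline:
        """Illustrative stand-in; shape inferred from the call sites in this PR."""

        def __init__(
            self,
            community_id: str,
            collection_name: str,
            use_cache: bool = True,  # assumed default, hence the explicit opt-out
        ) -> None:
            self.community_id = community_id
            self.collection_name = collection_name
            # With use_cache=False no ingestion cache is attached, so each run
            # re-processes and re-embeds documents instead of reusing stored results.
            self.cache: Optional[Any] = self._build_cache() if use_cache else None

        def _build_cache(self) -> Any:
            # Stand-in for whatever cache backend the real pipeline wires up;
            # the actual backend is not visible anywhere in this diff.
            return {}

    # Mirrors the call sites in this PR:
    pipeline = CustomIngestionPipeline("community-1", collection_name="platform-1", use_cache=False)

If the real default is indeed True, omitting the flag would silently re-enable caching, which would explain why each call site opts out explicitly rather than relying on the default.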