Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/rephrase/rephrase_style_controlled/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Rephrase with Style Control
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
--config_file examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
global_params:
working_dir: cache
kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv

nodes:
- id: read
op_name: read
type: source
dependencies: []
params:
input_path:
- examples/input_examples/json_demo.json

- id: chunk
op_name: chunk
type: map_batch
dependencies:
- read
execution_params:
replicas: 4
params:
chunk_size: 2048 # larger chunk size for better context
chunk_overlap: 200

- id: rephrase
op_name: rephrase
type: map_batch
dependencies:
- chunk
execution_params:
replicas: 1
batch_size: 128
save_output: true
params:
method: style_controlled
style: critical_analysis
3 changes: 2 additions & 1 deletion graphgen/bases/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from .base_evaluator import BaseEvaluator
from .base_extractor import BaseExtractor
from .base_generator import BaseGenerator
from .base_kg_builder import BaseKGBuilder
from .base_llm_wrapper import BaseLLMWrapper
from .base_operator import BaseOperator
from .base_partitioner import BasePartitioner
from .base_reader import BaseReader
from .base_rephraser import BaseRephraser
from .base_searcher import BaseSearcher
from .base_splitter import BaseSplitter
from .base_storage import BaseGraphStorage, BaseKVStorage, StorageNameSpace
from .base_tokenizer import BaseTokenizer
from .base_evaluator import BaseEvaluator
from .datatypes import Chunk, Config, Node, QAPair, Token
31 changes: 31 additions & 0 deletions graphgen/bases/base_rephraser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from abc import ABC, abstractmethod
from typing import Any

from graphgen.bases.base_llm_wrapper import BaseLLMWrapper


class BaseRephraser(ABC):
    """
    Abstract base class for LLM-backed text rephrasers.

    Subclasses provide the prompt construction (``build_prompt``) and the
    response post-processing (``parse_response``). ``rephrase`` ties the two
    together around a single call to the LLM client.
    """

    def __init__(self, llm_client: BaseLLMWrapper):
        # Client whose ``generate_answer`` coroutine is awaited in ``rephrase``.
        self.llm_client = llm_client

    @abstractmethod
    def build_prompt(self, text: str) -> str:
        """Build prompt for LLM based on the given text"""

    @staticmethod
    @abstractmethod
    def parse_response(response: str) -> Any:
        """Parse the LLM response and return the rephrased text"""

    async def rephrase(
        self,
        item: dict,
    ) -> Any:
        """
        Rephrase a single record.

        :param item: record holding the text to rephrase under the
            ``"content"`` key; a missing key raises ``KeyError``.
        :return: whatever ``parse_response`` returns for the LLM answer.
            The type is decided by the subclass, so it is not necessarily a
            dict (an implementation may, for example, return ``None``).
        """
        text = item["content"]
        prompt = self.build_prompt(text)
        response = await self.llm_client.generate_answer(prompt)
        return self.parse_response(response)
1 change: 1 addition & 0 deletions graphgen/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
RDFReader,
TXTReader,
)
from .rephraser import StyleControlledRephraser
from .searcher.db.ncbi_searcher import NCBISearch
from .searcher.db.rnacentral_searcher import RNACentralSearch
from .searcher.db.uniprot_searcher import UniProtSearch
Expand Down
1 change: 1 addition & 0 deletions graphgen/models/rephraser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .style_controlled_rephraser import StyleControlledRephraser
33 changes: 33 additions & 0 deletions graphgen/models/rephraser/style_controlled_rephraser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import Any, Optional

from graphgen.bases import BaseRephraser
from graphgen.templates import STYLE_CONTROLLED_REPHRASING_PROMPTS
from graphgen.utils import compute_content_hash, detect_main_language, logger


class StyleControlledRephraser(BaseRephraser):
    """
    Style Controlled Rephraser rephrases the input text based on a specified style.

    The style selects a group of prompt templates from
    ``STYLE_CONTROLLED_REPHRASING_PROMPTS``; within that group the template is
    chosen by the language detected for each input text.
    """

    def __init__(self, llm_client: Any, style: str = "critical_analysis"):
        """
        :param llm_client: client handed to ``BaseRephraser``.
        :param style: key of ``STYLE_CONTROLLED_REPHRASING_PROMPTS``.
        :raises ValueError: if ``style`` is not a known style.
        """
        super().__init__(llm_client)
        # Reject unknown styles up front with an actionable message instead of
        # letting a bare KeyError surface later from build_prompt.
        if style not in STYLE_CONTROLLED_REPHRASING_PROMPTS:
            supported = ", ".join(sorted(STYLE_CONTROLLED_REPHRASING_PROMPTS))
            raise ValueError(
                f"Unsupported rephrase style: {style!r}. "
                f"Supported styles: {supported}"
            )
        self.style = style

    def build_prompt(self, text: str) -> str:
        """
        Fill the template for the configured style with ``text``.

        :param text: text to be rephrased.
        :return: the formatted prompt.
        :raises KeyError: if the configured style has no template for the
            detected language.
        """
        logger.debug("Text to be rephrased: %s", text)
        language = detect_main_language(text)
        prompt_template = STYLE_CONTROLLED_REPHRASING_PROMPTS[self.style][language]
        return prompt_template.format(text=text)

    @staticmethod
    def parse_response(response: str) -> Optional[dict]:
        """
        Turn the raw LLM answer into an output record.

        :param response: raw LLM answer; ``None`` is treated like an empty
            answer.
        :return: ``None`` when the stripped answer is empty, otherwise a dict
            with the stripped text under ``"content"`` and its content hash
            under ``"_rephrased_id"``.
        """
        # Guard against a missing answer so one bad response yields an empty
        # result instead of an AttributeError.
        result = (response or "").strip()
        logger.debug("Raw rephrased response: %s", result)
        if not result:
            return None
        return {
            "_rephrased_id": compute_content_hash(result),
            "content": result,
        }
3 changes: 2 additions & 1 deletion graphgen/operators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
from .partition import PartitionService
from .quiz import QuizService
from .read import read
from .rephrase import RephraseService
from .search import SearchService


operators = {
"read": read,
"chunk": ChunkService,
Expand All @@ -21,4 +21,5 @@
"partition": PartitionService,
"generate": GenerateService,
"evaluate": EvaluateService,
"rephrase": RephraseService,
}
1 change: 1 addition & 0 deletions graphgen/operators/rephrase/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .rephrase_service import RephraseService
48 changes: 48 additions & 0 deletions graphgen/operators/rephrase/rephrase_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd

from graphgen.bases import BaseLLMWrapper, BaseOperator
from graphgen.common import init_llm
from graphgen.utils import run_concurrent


class RephraseService(BaseOperator):
    """
    Rephrase the text of each input record with an LLM-backed rephraser.
    """

    def __init__(
        self,
        working_dir: str = "cache",
        method: str = "style_controlled",
        **rephrase_kwargs,
    ):
        """
        :param working_dir: forwarded to ``BaseOperator``.
        :param method: rephrasing method; only ``"style_controlled"`` is
            supported.
        :param rephrase_kwargs: method-specific options. For
            ``"style_controlled"`` the optional ``style`` entry is used
            (default ``"critical_analysis"``).
        :raises ValueError: if ``method`` is not supported.
        """
        super().__init__(working_dir=working_dir, op_name="rephrase_service")

        # Validate first so an unsupported method fails before the LLM client
        # is initialized.
        if method != "style_controlled":
            raise ValueError(f"Unsupported rephrase method: {method}")

        self.method = method
        self.rephrase_kwargs = rephrase_kwargs
        self.llm_client: BaseLLMWrapper = init_llm("synthesizer")

        # Function-scope import kept from the original.
        # NOTE(review): presumably avoids a circular import — confirm.
        from graphgen.models import StyleControlledRephraser

        self.rephraser = StyleControlledRephraser(
            self.llm_client,
            style=rephrase_kwargs.get("style", "critical_analysis"),
        )

    def process(self, batch: pd.DataFrame) -> pd.DataFrame:
        """
        Rephrase every row of ``batch``.

        :param batch: input rows; each row is converted to a dict record.
        :return: a DataFrame built from the non-empty rephrasing results.
        """
        items = batch.to_dict(orient="records")
        return pd.DataFrame(self.rephrase(items))

    def rephrase(self, items: list[dict]) -> list[dict]:
        """
        Run the rephraser over ``items`` and drop empty results.

        :param items: records passed one by one to the rephraser's
            ``rephrase`` coroutine via ``run_concurrent``.
        :return: the truthy results only.
        """
        results = run_concurrent(
            self.rephraser.rephrase,
            items,
            desc="Rephrasing texts",
            unit="batch",
        )

        # Filter out empty results
        return [res for res in results if res]
1 change: 1 addition & 0 deletions graphgen/templates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
VQA_GENERATION_PROMPT,
)
from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
from .rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS
from .search_judgement import SEARCH_JUDGEMENT_PROMPT
from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
1 change: 1 addition & 0 deletions graphgen/templates/rephrasing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .style_controlled_rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .critical_analysis_rephrasing import CRITICAL_ANALYSIS_REPHRASING_PROMPTS
from .cross_domain_analogy_rephrasing import CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS
from .executive_summary_rephrasing import EXECUTIVE_SUMMARY_REPHRASING_PROMPTS
from .first_person_narrative_rephrasing import FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS
from .historical_evolution_perspective_rephrasing import (
HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS,
)
from .popular_science_rephrasing import POPULAR_SCIENCE_REPHRASING_PROMPTS
from .qa_dialogue_format_rephrasing import QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS
from .technical_deep_dive_rephrasing import TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS

# Registry of style-controlled rephrasing prompts, keyed by style name.
# Each value is the per-style prompt collection imported above.
STYLE_CONTROLLED_REPHRASING_PROMPTS = {
    "popular_science": POPULAR_SCIENCE_REPHRASING_PROMPTS,
    "critical_analysis": CRITICAL_ANALYSIS_REPHRASING_PROMPTS,
    "cross_domain_analogy": CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS,
    "technical_deep_dive": TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS,
    "executive_summary": EXECUTIVE_SUMMARY_REPHRASING_PROMPTS,
    "first_person_narrative": FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS,
    "historical_evolution_perspective": HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS,
    "qa_dialogue_format": QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Chinese prompt template for the critical-analysis rephrasing style.
# The only placeholder is {text}, which receives the original content.
TEMPLATE_ZH = """
【任务】以学术批判视角改写以下内容,形成技术评论文章。

【核心要求】
1. 语气风格:客观理性,第三人称学术视角,使用规范学术用语
2. 内容结构:
- 准确总结原文核心方法/发现(占比40%)
- 分析技术优势与创新点(占比20%)
- 指出潜在局限性与假设条件(占比20%)
- 提出可能的改进方向或未来工作(占比20%)
3. 引用规范:保留原文所有关键引用,采用标准学术引用格式
4. 事实准确性:不得歪曲或误读原文技术细节

【输出格式】
- 标题:原标题 + ":一项批判性分析"
- 段落:标准学术论文章节结构
- 字数:与原文相当或略长

原文内容:
{text}

请输出批判性分析改写版本:
"""

# English prompt template for the critical-analysis rephrasing style.
# The only placeholder is {text}, which receives the original content.
TEMPLATE_EN = """
【Task】Rewrite the following content from an academic critical perspective as a technical commentary.

【Core Requirements】
1. Tone: Objective and rational, third-person academic perspective, using standard academic terminology
2. Structure:
- Accurately summarize core methods/findings (40% of content)
- Analyze technical advantages and innovations (20%)
- Identify potential limitations and assumptions (20%)
- Propose possible improvements or future work (20%)
3. Citations: Retain all key references from original, using standard academic citation format
4. Factual Accuracy: Do not distort or misinterpret technical details

【Output Format】
- Title: Original Title + ": A Critical Analysis"
- Paragraphs: Standard academic paper structure
- Length: Similar to or slightly longer than original

Original Content:
{text}

Please output the critically analyzed rewrite:
"""

# Critical-analysis prompt templates keyed by language code.
CRITICAL_ANALYSIS_REPHRASING_PROMPTS = {
    "zh": TEMPLATE_ZH,
    "en": TEMPLATE_EN,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Chinese prompt template for the cross-domain-analogy rephrasing style.
# The only placeholder is {text}, which receives the original content.
TEMPLATE_ZH = """
【任务】通过跨领域类比解释技术概念。

【类比原则】
- 类比源领域:生物学、物理学、建筑学、经济学、烹饪等领域
- 类比强度:类比关系需直观且深刻,避免牵强附会
- 目标:降低理解门槛,同时保持技术严谨性

【核心要求】
1. 双轨并行:每个技术概念配一个恰当类比
2. 类比结构:
- 先介绍技术概念(准确、完整)
- 再引入类比对象及其映射关系
- 最后说明类比局限性和适用范围
3. 保真红线:技术部分必须与原文完全一致,不得因类比而简化
4. 创新性:鼓励使用新颖、出人意料但合理的类比
5. 篇幅:可比原文扩展20-40%

【评估标准】
- 类比恰当性(技术概念与类比对象的核心机制必须同构)
- 技术准确性(不得扭曲事实)
- 启发性(帮助读者建立深层理解)

原文内容:
{text}

请输出跨领域类比版本:
"""

# English prompt template for the cross-domain-analogy rephrasing style.
# The only placeholder is {text}, which receives the original content.
TEMPLATE_EN = """
【Task】Explain technical concepts through cross-domain analogies.

【Analogy Principles】
- Source Domains: Biology, physics, architecture, economics, cooking, etc.
- Strength: Analogy should be intuitive yet profound, avoid forced comparisons
- Goal: Lower understanding barrier while maintaining technical rigor

【Core Requirements】
1. Dual Track: Pair each technical concept with an appropriate analogy
2. Analogy Structure:
- First introduce technical concept (accurate and complete)
- Then introduce analogy object and mapping relationship
- Finally explain analogy limitations and applicable scope
3. Fidelity Baseline: Technical parts must be identical to original, no simplification for analogy's sake
4. Innovation: Encourage novel, surprising but reasonable analogies
5. Length: May expand 20-40% beyond original

【Evaluation Criteria】
- Analogy Appropriateness (core mechanisms must be isomorphic)
- Technical Accuracy (no factual distortion)
- Heuristic Value (helps build deep understanding)

Original Content:
{text}

Please output the cross-domain analogy version:
"""

# Cross-domain-analogy prompt templates keyed by language code.
CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS = {
    "zh": TEMPLATE_ZH,
    "en": TEMPLATE_EN,
}
Loading