InternScience · ChenZiHong-Gavin · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026
diff --git a/examples/rephrase/rephrase_style_controlled/README.md b/examples/rephrase/rephrase_style_controlled/README.md
@@ -0,0 +1 @@
+# Rephrase with Style Control
diff --git a/examples/rephrase/rephrase_style_controlled/rephrase_style_controlled.sh b/examples/rephrase/rephrase_style_controlled/rephrase_style_controlled.sh
@@ -0,0 +1,2 @@
+python3 -m graphgen.run \
+--config_file examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml
diff --git a/examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml b/examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml
@@ -0,0 +1,36 @@
+global_params:
+  working_dir: cache
+  kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv
+
+nodes:
+  - id: read
+    op_name: read
+    type: source
+    dependencies: [] # source node: nothing upstream
+    params:
+      input_path:
+        - examples/input_examples/json_demo.json
+
+  - id: chunk
+    op_name: chunk
+    type: map_batch
+    dependencies:
+      - read
+    execution_params:
+      replicas: 4
+    params:
+      chunk_size: 2048 # larger chunk size for better context
+      chunk_overlap: 200
+
+  - id: rephrase
+    op_name: rephrase
+    type: map_batch
+    dependencies:
+      - chunk
+    execution_params:
+      replicas: 1
+      batch_size: 128
+    save_output: true
+    params:
+      method: style_controlled # the only rephrase method RephraseService accepts; anything else raises ValueError
+      style: critical_analysis # one of: popular_science, critical_analysis, cross_domain_analogy, technical_deep_dive, executive_summary, first_person_narrative, historical_evolution_perspective, qa_dialogue_format
diff --git a/graphgen/bases/__init__.py b/graphgen/bases/__init__.py
@@ -1,13 +1,14 @@
+from .base_evaluator import BaseEvaluator
 from .base_extractor import BaseExtractor
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_wrapper import BaseLLMWrapper
 from .base_operator import BaseOperator
 from .base_partitioner import BasePartitioner
 from .base_reader import BaseReader
+from .base_rephraser import BaseRephraser
 from .base_searcher import BaseSearcher
 from .base_splitter import BaseSplitter
 from .base_storage import BaseGraphStorage, BaseKVStorage, StorageNameSpace
 from .base_tokenizer import BaseTokenizer
-from .base_evaluator import BaseEvaluator
 from .datatypes import Chunk, Config, Node, QAPair, Token
diff --git a/graphgen/bases/base_rephraser.py b/graphgen/bases/base_rephraser.py
@@ -0,0 +1,47 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
+
+
+class BaseRephraser(ABC):
+    """
+    Abstract base class for rephrasers that rewrite text with an LLM.
+
+    Subclasses supply the prompt construction (``build_prompt``) and the
+    response parsing (``parse_response``); ``rephrase`` chains the two around
+    a single call to the LLM client.
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper):
+        """
+        :param llm_client: client whose ``generate_answer`` coroutine is awaited
+            by ``rephrase``.
+        """
+        self.llm_client = llm_client
+
+    @abstractmethod
+    def build_prompt(self, text: str) -> str:
+        """Build prompt for LLM based on the given text"""
+
+    @staticmethod
+    @abstractmethod
+    def parse_response(response: str) -> Any:
+        """Parse the LLM response and return the rephrased text"""
+
+    async def rephrase(
+        self,
+        item: dict,
+    ) -> Any:
+        """
+        Rephrase a single item.
+
+        :param item: record holding the source text under the ``"content"`` key.
+        :return: whatever ``parse_response`` produces for the LLM response; the
+            exact shape (and whether it can be ``None``) is up to the subclass.
+        :raises KeyError: if ``item`` has no ``"content"`` key.
+        """
+        text = item["content"]
+        prompt = self.build_prompt(text)
+        response = await self.llm_client.generate_answer(prompt)
+        return self.parse_response(response)
diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py
@@ -37,6 +37,7 @@
     RDFReader,
     TXTReader,
 )
+from .rephraser import StyleControlledRephraser
 from .searcher.db.ncbi_searcher import NCBISearch
 from .searcher.db.rnacentral_searcher import RNACentralSearch
 from .searcher.db.uniprot_searcher import UniProtSearch

diff --git a/graphgen/models/rephraser/__init__.py b/graphgen/models/rephraser/__init__.py
@@ -0,0 +1 @@
+from .style_controlled_rephraser import StyleControlledRephraser
diff --git a/graphgen/models/rephraser/style_controlled_rephraser.py b/graphgen/models/rephraser/style_controlled_rephraser.py
@@ -0,0 +1,66 @@
+from typing import Any, Optional
+
+from graphgen.bases import BaseRephraser
+from graphgen.templates import STYLE_CONTROLLED_REPHRASING_PROMPTS
+from graphgen.utils import compute_content_hash, detect_main_language, logger
+
+
+class StyleControlledRephraser(BaseRephraser):
+    """
+    Style Controlled Rephraser rephrases the input text based on a specified style.
+
+    The style selects one entry of ``STYLE_CONTROLLED_REPHRASING_PROMPTS``; the
+    language detected for each input text then selects the prompt template
+    within that entry.
+    """
+
+    def __init__(self, llm_client: Any, style: str = "critical_analysis"):
+        """
+        :param llm_client: LLM client forwarded to ``BaseRephraser``.
+        :param style: key into ``STYLE_CONTROLLED_REPHRASING_PROMPTS``.
+        :raises ValueError: if ``style`` is not a known style.
+        """
+        # Fail fast on a misconfigured style instead of raising a bare KeyError
+        # from build_prompt for every single item later on.
+        if style not in STYLE_CONTROLLED_REPHRASING_PROMPTS:
+            supported = ", ".join(sorted(STYLE_CONTROLLED_REPHRASING_PROMPTS))
+            raise ValueError(
+                f"Unsupported rephrase style: {style!r}. Supported styles: {supported}"
+            )
+        super().__init__(llm_client)
+        self.style = style
+
+    def build_prompt(self, text: str) -> str:
+        """
+        Fill the template for the configured style and the text's language.
+
+        :param text: text to be rephrased; substituted for ``{text}``.
+        :return: the formatted prompt.
+        :raises KeyError: if the detected language has no template for the style.
+        """
+        logger.debug("Text to be rephrased: %s", text)
+        language = detect_main_language(text)
+        prompt_template = STYLE_CONTROLLED_REPHRASING_PROMPTS[self.style][language]
+        prompt = prompt_template.format(text=text)
+        return prompt
+
+    @staticmethod
+    def parse_response(response: str) -> Optional[dict]:
+        """
+        Wrap the LLM response in a result record.
+
+        :param response: raw LLM output.
+        :return: ``None`` when the response is missing or blank; otherwise a dict
+            with the stripped text under ``"content"`` and its content hash
+            under ``"_rephrased_id"``.
+        """
+        # Treat a missing response like a blank one, in case the client hands
+        # back None, rather than raising AttributeError on ``.strip()``.
+        result = (response or "").strip()
+        logger.debug("Raw rephrased response: %s", result)
+        if not result:
+            return None
+        return {
+            "_rephrased_id": compute_content_hash(result),
+            "content": result,
+        }
diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py
@@ -7,9 +7,9 @@
 from .partition import PartitionService
 from .quiz import QuizService
 from .read import read
+from .rephrase import RephraseService
 from .search import SearchService
 
-
 operators = {
     "read": read,
     "chunk": ChunkService,
@@ -21,4 +21,5 @@
     "partition": PartitionService,
     "generate": GenerateService,
     "evaluate": EvaluateService,
+    "rephrase": RephraseService,
 }
diff --git a/graphgen/operators/rephrase/__init__.py b/graphgen/operators/rephrase/__init__.py
@@ -0,0 +1 @@
+from .rephrase_service import RephraseService
diff --git a/graphgen/operators/rephrase/rephrase_service.py b/graphgen/operators/rephrase/rephrase_service.py
@@ -0,0 +1,69 @@
+import pandas as pd
+
+from graphgen.bases import BaseLLMWrapper, BaseOperator
+from graphgen.common import init_llm
+from graphgen.utils import run_concurrent
+
+
+class RephraseService(BaseOperator):
+    """
+    Rephrase the text content of input records with an LLM-backed rephraser.
+    """
+
+    def __init__(
+        self,
+        working_dir: str = "cache",
+        method: str = "style_controlled",
+        **rephrase_kwargs,
+    ):
+        """
+        :param working_dir: working directory forwarded to ``BaseOperator``.
+        :param method: rephrasing method; only ``"style_controlled"`` is
+            supported, so it is also the default.
+        :param rephrase_kwargs: method-specific options; ``style`` is read for
+            ``"style_controlled"`` and defaults to ``"critical_analysis"``.
+        :raises ValueError: if ``method`` is not supported.
+        """
+        super().__init__(working_dir=working_dir, op_name="rephrase_service")
+        self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
+        self.method = method
+        self.rephrase_kwargs = rephrase_kwargs
+
+        if self.method == "style_controlled":
+            from graphgen.models import StyleControlledRephraser
+
+            self.rephraser = StyleControlledRephraser(
+                self.llm_client,
+                style=rephrase_kwargs.get("style", "critical_analysis"),
+            )
+        else:
+            raise ValueError(f"Unsupported rephrase method: {self.method}")
+
+    def process(self, batch: pd.DataFrame) -> pd.DataFrame:
+        """
+        Rephrase every record of ``batch``.
+
+        :param batch: input records; the rephraser reads each record's text
+            from its ``content`` field.
+        :return: a DataFrame built from the non-empty rephrased results.
+        """
+        items = batch.to_dict(orient="records")
+        return pd.DataFrame(self.rephrase(items))
+
+    def rephrase(self, items: list[dict]) -> list[dict]:
+        """
+        Rephrase ``items`` through ``run_concurrent`` and drop empty results.
+
+        :param items: records handed to the rephraser's ``rephrase`` coroutine.
+        :return: the truthy results only.
+        """
+        results = run_concurrent(
+            self.rephraser.rephrase,
+            items,
+            desc="Rephrasing texts",
+            unit="batch",
+        )
+
+        # Filter out empty results
+        results = [res for res in results if res]
+        return results
diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py
@@ -14,5 +14,6 @@
     VQA_GENERATION_PROMPT,
 )
 from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
+from .rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS
 from .search_judgement import SEARCH_JUDGEMENT_PROMPT
 from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
diff --git a/graphgen/templates/rephrasing/__init__.py b/graphgen/templates/rephrasing/__init__.py
@@ -0,0 +1 @@
+from .style_controlled_rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/__init__.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/__init__.py
@@ -0,0 +1,21 @@
+from .critical_analysis_rephrasing import CRITICAL_ANALYSIS_REPHRASING_PROMPTS
+from .cross_domain_analogy_rephrasing import CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS
+from .executive_summary_rephrasing import EXECUTIVE_SUMMARY_REPHRASING_PROMPTS
+from .first_person_narrative_rephrasing import FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS
+from .historical_evolution_perspective_rephrasing import (
+    HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS,
+)
+from .popular_science_rephrasing import POPULAR_SCIENCE_REPHRASING_PROMPTS
+from .qa_dialogue_format_rephrasing import QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS
+from .technical_deep_dive_rephrasing import TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS
+
+STYLE_CONTROLLED_REPHRASING_PROMPTS = {  # style name -> per-style prompt mapping
+    "popular_science": POPULAR_SCIENCE_REPHRASING_PROMPTS,
+    "critical_analysis": CRITICAL_ANALYSIS_REPHRASING_PROMPTS,
+    "cross_domain_analogy": CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS,
+    "technical_deep_dive": TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS,
+    "executive_summary": EXECUTIVE_SUMMARY_REPHRASING_PROMPTS,
+    "first_person_narrative": FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS,
+    "historical_evolution_perspective": HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS,
+    "qa_dialogue_format": QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS,
+}
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/critical_analysis_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/critical_analysis_rephrasing.py
@@ -0,0 +1,52 @@
+TEMPLATE_ZH = """
+【任务】以学术批判视角改写以下内容，形成技术评论文章。
+
+【核心要求】
+1. 语气风格：客观理性，第三人称学术视角，使用规范学术用语
+2. 内容结构：
+   - 准确总结原文核心方法/发现（占比40%）
+   - 分析技术优势与创新点（占比20%）
+   - 指出潜在局限性与假设条件（占比20%）
+   - 提出可能的改进方向或未来工作（占比20%）
+3. 引用规范：保留原文所有关键引用，采用标准学术引用格式
+4. 事实准确性：不得歪曲或误读原文技术细节
+
+【输出格式】
+- 标题：原标题 + "：一项批判性分析"
+- 段落：标准学术论文章节结构
+- 字数：与原文相当或略长
+
+原文内容：
+{text}
+
+请输出批判性分析改写版本：
+"""
+
+TEMPLATE_EN = """
+【Task】Rewrite the following content from an academic critical perspective as a technical commentary.
+
+【Core Requirements】
+1. Tone: Objective and rational, third-person academic perspective, using standard academic terminology
+2. Structure:
+   - Accurately summarize core methods/findings (40% of content)
+   - Analyze technical advantages and innovations (20%)
+   - Identify potential limitations and assumptions (20%)
+   - Propose possible improvements or future work (20%)
+3. Citations: Retain all key references from original, using standard academic citation format
+4. Factual Accuracy: Do not distort or misinterpret technical details
+
+【Output Format】
+- Title: Original Title + ": A Critical Analysis"
+- Paragraphs: Standard academic paper structure
+- Length: Similar to or slightly longer than original
+
+Original Content:
+{text}
+
+Please output the critically analyzed rewrite:
+"""
+
+CRITICAL_ANALYSIS_REPHRASING_PROMPTS = {  # language code -> prompt template with a {text} placeholder
+    "zh": TEMPLATE_ZH,  # Chinese-language prompt
+    "en": TEMPLATE_EN,  # English-language prompt
+}
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/cross_domain_analogy_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/cross_domain_analogy_rephrasing.py
@@ -0,0 +1,62 @@
+TEMPLATE_ZH = """
+【任务】通过跨领域类比解释技术概念。
+
+【类比原则】
+- 类比源领域：生物学、物理学、建筑学、经济学、烹饪等领域
+- 类比强度：类比关系需直观且深刻，避免牵强附会
+- 目标：降低理解门槛，同时保持技术严谨性
+
+【核心要求】
+1. 双轨并行：每个技术概念配一个恰当类比
+2. 类比结构：
+   - 先介绍技术概念（准确、完整）
+   - 再引入类比对象及其映射关系
+   - 最后说明类比局限性和适用范围
+3. 保真红线：技术部分必须与原文完全一致，不得因类比而简化
+4. 创新性：鼓励使用新颖、出人意料但合理的类比
+5. 篇幅：可比原文扩展20-40%
+
+【评估标准】
+- 类比恰当性（技术概念与类比对象的核心机制必须同构）
+- 技术准确性（不得扭曲事实）
+- 启发性（帮助读者建立深层理解）
+
+原文内容：
+{text}
+
+请输出跨领域类比版本：
+"""
+
+TEMPLATE_EN = """
+【Task】Explain technical concepts through cross-domain analogies.
+
+【Analogy Principles】
+- Source Domains: Biology, physics, architecture, economics, cooking, etc.
+- Strength: Analogy should be intuitive yet profound, avoid forced comparisons
+- Goal: Lower understanding barrier while maintaining technical rigor
+
+【Core Requirements】
+1. Dual Track: Pair each technical concept with an appropriate analogy
+2. Analogy Structure:
+   - First introduce technical concept (accurate and complete)
+   - Then introduce analogy object and mapping relationship
+   - Finally explain analogy limitations and applicable scope
+3. Fidelity Baseline: Technical parts must be identical to original, no simplification for analogy's sake
+4. Innovation: Encourage novel, surprising but reasonable analogies
+5. Length: May expand 20-40% beyond original
+
+【Evaluation Criteria】
+- Analogy Appropriateness (core mechanisms must be isomorphic)
+- Technical Accuracy (no factual distortion)
+- Heuristic Value (helps build deep understanding)
+
+Original Content:
+{text}
+
+Please output the cross-domain analogy version:
+"""
+
+CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS = {  # language code -> prompt template with a {text} placeholder
+    "zh": TEMPLATE_ZH,  # Chinese-language prompt
+    "en": TEMPLATE_EN,  # English-language prompt
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		python3 -m graphgen.run \
		--config_file examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .style_controlled_rephraser import StyleControlledRephraser
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .style_controlled_rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS