Merged
3 changes: 3 additions & 0 deletions examples/generate/generate_fill_in_blank_qa/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Generate Fill-in-blank QAs

Fill-in-blank question answering (QA) involves creating questions where a key piece of information is omitted, requiring the respondent to fill in the missing word or phrase. This format is commonly used in educational assessments to test knowledge and comprehension.
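For illustration, a single generated record in the Alpaca output format might look like the sketch below (the question and answer are hypothetical; actual contents depend on the source corpus):

```python
# Hypothetical fill-in-blank record in Alpaca format: the blank is marked
# with underscores in the instruction, and the omitted term is the output.
record = {
    "instruction": "The process by which plants convert sunlight into "
    "chemical energy is called ____.",
    "input": "",
    "output": "photosynthesis",
}
print(record["output"])  # photosynthesis
```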
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend; supported: kuzu, networkx
  kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv

nodes:
  - id: read_files # id must be unique within the pipeline and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path; supported formats: json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # overlap between adjacent chunks

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy; supported: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # persist generated output
    params:
      method: fill_in_blank
      num_of_questions: 5
      data_format: Alpaca # supported: Alpaca, Sharegpt, ChatML
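The `dependencies` fields above form a DAG, so conceptually the runner executes nodes in topological order. A minimal sketch of that ordering (hypothetical; not the actual GraphGen scheduler):

```python
from graphlib import TopologicalSorter

# Node -> predecessors, mirroring the pipeline config above.
deps = {
    "read_files": [],
    "chunk_documents": ["read_files"],
    "build_kg": ["chunk_documents"],
    "quiz": ["build_kg"],
    "judge": ["quiz"],
    "partition": ["judge"],
    "generate": ["partition"],
}
# For a linear chain like this one, the order is fully determined.
order = list(TopologicalSorter(deps).static_order())
print(order)
```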
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
--config_file examples/generate/generate_fill_in_blank_qa/fill_in_blank_config.yaml
3 changes: 3 additions & 0 deletions examples/generate/generate_multi_answer_qa/README.md
@@ -0,0 +1,3 @@
# Generate Multi-Answer QAs

Multi-answer question answering (QA) involves generating questions that can have multiple valid answers. This is particularly useful in educational settings, surveys, and research where diverse perspectives are valuable.
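As a sketch, one such record in the Sharegpt output format might look like the following (the question and answers are hypothetical):

```python
# Hypothetical multi-answer record in Sharegpt format: a single question
# whose reply enumerates several valid answers.
record = {
    "conversations": [
        {"from": "human", "value": "Name two renewable energy sources."},
        {"from": "gpt", "value": "Solar power and wind power are both renewable."},
    ]
}
print(record["conversations"][1]["value"])
```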
Contributor (medium):

This file is missing a final newline character. It's a common convention to end files with a newline to ensure consistency and prevent issues with some tools.
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
--config_file examples/generate/generate_multi_answer_qa/multi_answer_config.yaml
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend; supported: kuzu, networkx
  kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv

nodes:
  - id: read_files # id must be unique within the pipeline and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path; supported formats: json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # overlap between adjacent chunks

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy; supported: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # persist generated output
    params:
      method: multi_answer
      num_of_questions: 5
      data_format: Alpaca # supported: Alpaca, Sharegpt, ChatML
3 changes: 3 additions & 0 deletions examples/generate/generate_multi_choice_qa/README.md
@@ -0,0 +1,3 @@
# Generate Multi-Choice QAs

Multi-choice question answering (QA) tasks involve providing a question along with several answer options, where the goal is to select the correct answer from the given choices.
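When a generated item carries an options dict, the options are rendered into the question text as lettered lines. A sketch of that rendering, with a hypothetical question and options:

```python
# Render a hypothetical options dict into the question text, mirroring the
# "A. ...", "B. ..." layout used in the generated output.
question = "Which planet is closest to the Sun?"
options = {"A": "Mercury", "B": "Venus", "C": "Earth", "D": "Mars"}
options_str = "\n".join(f"{key}. {options[key]}" for key in sorted(options))
question += f"\nOptions:\n{options_str}"
print(question)
```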
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
--config_file examples/generate/generate_multi_choice_qa/multi_choice_config.yaml
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend; supported: kuzu, networkx
  kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv

nodes:
  - id: read_files # id must be unique within the pipeline and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path; supported formats: json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # overlap between adjacent chunks

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy; supported: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # persist generated output
    params:
      method: multi_choice
      num_of_questions: 5
      data_format: Alpaca # supported: Alpaca, Sharegpt, ChatML
79 changes: 44 additions & 35 deletions graphgen/bases/base_generator.py
@@ -46,38 +46,47 @@ async def generate(
def format_generation_results(
    results: list[dict], output_data_format: str
) -> list[dict[str, Any]]:
    if output_data_format == "Alpaca":
        results = [
            {
                "instruction": v["question"],
                "input": "",
                "output": v["answer"],
            }
            for item in results
            for k, v in item.items()
        ]
    elif output_data_format == "Sharegpt":
        results = [
            {
                "conversations": [
                    {"from": "human", "value": v["question"]},
                    {"from": "gpt", "value": v["answer"]},
                ]
            }
            for item in results
            for k, v in item.items()
        ]
    elif output_data_format == "ChatML":
        results = [
            {
                "messages": [
                    {"role": "user", "content": v["question"]},
                    {"role": "assistant", "content": v["answer"]},
                ]
            }
            for item in results
            for k, v in item.items()
        ]
    else:
        raise ValueError(f"Unknown output data format: {output_data_format}")
    return results

    flat_results = []
    for item in results:
        for _, qa_data in item.items():
            question = qa_data.get("question", "")
            answer = qa_data.get("answer", "")
            if "options" in qa_data and qa_data["options"]:
                options = qa_data["options"]
                options_str = "\n".join(
                    [f"{key}. {options[key]}" for key in sorted(options.keys())]
                )
                question += f"\nOptions:\n{options_str}"

            if output_data_format == "Alpaca":
                flat_results.append(
                    {
                        "instruction": question,
                        "input": "",
                        "output": answer,
                    }
                )
            elif output_data_format == "Sharegpt":
                flat_results.append(
                    {
                        "conversations": [
                            {"from": "human", "value": question},
                            {"from": "gpt", "value": answer},
                        ]
                    }
                )
            elif output_data_format == "ChatML":
                flat_results.append(
                    {
                        "messages": [
                            {"role": "user", "content": question},
                            {"role": "assistant", "content": answer},
                        ]
                    }
                )
            else:
                raise ValueError(
                    f"Unknown output data format: {output_data_format}"
                )
    return flat_results
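To make the flattening step concrete, here is a minimal standalone sketch of the Alpaca branch with a hypothetical input (not the actual class method, which also handles the Sharegpt and ChatML branches):

```python
from typing import Any


def to_alpaca(results: list[dict]) -> list[dict[str, Any]]:
    """Flatten {key: {"question", "answer", optional "options"}} items into
    Alpaca-style records, appending any options to the question text."""
    flat: list[dict[str, Any]] = []
    for item in results:
        for qa in item.values():
            question = qa.get("question", "")
            if qa.get("options"):
                opts = qa["options"]
                question += "\nOptions:\n" + "\n".join(
                    f"{k}. {opts[k]}" for k in sorted(opts)
                )
            flat.append(
                {"instruction": question, "input": "", "output": qa.get("answer", "")}
            )
    return flat


sample = [{"qa_0": {"question": "2 + 2 = ?", "answer": "4"}}]
print(to_alpaca(sample))  # [{'instruction': '2 + 2 = ?', 'input': '', 'output': '4'}]
```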
3 changes: 3 additions & 0 deletions graphgen/models/__init__.py
@@ -11,6 +11,9 @@
AggregatedGenerator,
AtomicGenerator,
CoTGenerator,
FillInBlankGenerator,
MultiAnswerGenerator,
MultiChoiceGenerator,
MultiHopGenerator,
QuizGenerator,
VQAGenerator,
3 changes: 3 additions & 0 deletions graphgen/models/generator/__init__.py
@@ -1,6 +1,9 @@
from .aggregated_generator import AggregatedGenerator
from .atomic_generator import AtomicGenerator
from .cot_generator import CoTGenerator
from .fill_in_blank_generator import FillInBlankGenerator
from .multi_answer_generator import MultiAnswerGenerator
from .multi_choice_generator import MultiChoiceGenerator
from .multi_hop_generator import MultiHopGenerator
from .quiz_generator import QuizGenerator
from .vqa_generator import VQAGenerator