diff --git a/README.md b/README.md
index 740909f5e..b825c03e4 100644
--- a/README.md
+++ b/README.md
@@ -354,6 +354,29 @@ llm:
   model: "moa&readurls-o3" # Test-time compute + web access
 ```
+
+
+#### 🕵 Aliyun BaiLian (CoT analysis)
+The `enable_thinking` parameter can only be enabled on individual entries in the `models` list.
+
+```yaml
+# config.yaml
+llm:
+  api_base: "https://dashscope.aliyuncs.com/compatible-mode/v1"
+  thinking_budget: 2048
+  models:
+    - name: "qwen-flash"
+      weight: 0.5
+      enable_thinking: false
+    - name: "qwen-plus"
+      weight: 0.5
+      enable_thinking: true
+```
+
+```bash
+export OPENAI_API_KEY="your-bailian-api-key"
+```
+
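+The snippet below is an illustrative sketch of roughly what OpenEvolve sends for a model with `enable_thinking: true`: both options are forwarded through the OpenAI SDK's `extra_body`, and the reasoning trace comes back as `reasoning_content` on the message. Whether a provider accepts these parameters (and returns a trace) depends on the endpoint and model.
+
+```python
+import os
+
+from openai import OpenAI
+
+client = OpenAI(
+    api_key=os.environ["OPENAI_API_KEY"],
+    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+)
+response = client.chat.completions.create(
+    model="qwen-plus",
+    messages=[{"role": "user", "content": "Explain MAP-Elites in one sentence."}],
+    extra_body={"enable_thinking": True, "thinking_budget": 2048},
+)
+message = response.choices[0].message
+print(message.content)                              # final answer
+print(getattr(message, "reasoning_content", None))  # chain-of-thought trace, if any
+```
+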
 
 ## Examples Gallery
 
@@ -450,12 +473,18 @@ random_seed: 42 # Full reproducibility
 
 llm: # Ensemble configuration
+  api_base: "https://dashscope.aliyuncs.com/compatible-mode/v1"
+  thinking_budget: 2048 # reasoning-token budget; only applies when enable_thinking is true
   models:
-    - name: "gemini-2.5-pro"
+    - name: "qwen-plus"
       weight: 0.6
-    - name: "gemini-2.5-flash"
+      enable_thinking: true # important: if the API provider does not support this parameter, setting it will make requests fail!
+    - name: "qwen-flash"
       weight: 0.4
+      enable_thinking: false
   temperature: 0.7
+  max_tokens: 16384
+  timeout: 300
 
 database: # MAP-Elites quality-diversity
diff --git a/openevolve/config.py b/openevolve/config.py
index e01db8697..542819eab 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -72,6 +72,7 @@ class LLMModelConfig:
     timeout: int = None
     retries: int = None
     retry_delay: int = None
+    thinking_budget: Optional[int] = None
 
     # Reproducibility
     random_seed: Optional[int] = None
@@ -79,6 +80,9 @@ class LLMModelConfig:
     # Reasoning parameters
     reasoning_effort: Optional[str] = None
 
+    # enable_thinking (mainly used by Chinese providers)
+    enable_thinking: Optional[bool] = None
+
     def __post_init__(self):
         """Post-initialization to resolve ${VAR} env var references in api_key"""
         self.api_key = _resolve_env_var(self.api_key)
@@ -96,6 +100,7 @@ class LLMConfig(LLMModelConfig):
     temperature: float = 0.7
     top_p: float = 0.95
     max_tokens: int = 4096
+    thinking_budget: Optional[int] = None
 
     # Request parameters
     timeout: int = 60
@@ -125,7 +130,8 @@ def __post_init__(self):
         if self.primary_model:
             # Create primary model
             primary_model = LLMModelConfig(
-                name=self.primary_model, weight=self.primary_model_weight or 1.0
+                name=self.primary_model,
+                weight=self.primary_model_weight or 1.0,
             )
             self.models.append(primary_model)
 
@@ -171,6 +177,8 @@ def __post_init__(self):
             "retry_delay": self.retry_delay,
             "random_seed": self.random_seed,
             "reasoning_effort": self.reasoning_effort,
+            "enable_thinking": self.enable_thinking,
+            "thinking_budget": self.thinking_budget,
         }
         self.update_model_params(shared_config)
 
@@ -191,7 +199,8 @@ def rebuild_models(self) -> None:
         if self.primary_model:
             # Create primary model
             primary_model = LLMModelConfig(
-                name=self.primary_model, weight=self.primary_model_weight or 1.0
+                name=self.primary_model,
+                weight=self.primary_model_weight or 1.0,
             )
             self.models.append(primary_model)
 
@@ -224,6 +233,8 @@ def rebuild_models(self) -> None:
             "retry_delay": self.retry_delay,
             "random_seed": self.random_seed,
             "reasoning_effort": self.reasoning_effort,
+            "enable_thinking": self.enable_thinking,
+            "thinking_budget": self.thinking_budget,
         }
         self.update_model_params(shared_config)
 
diff --git a/openevolve/database.py b/openevolve/database.py
index d39792c0c..1bc1e7702 100644
--- a/openevolve/database.py
+++ b/openevolve/database.py
@@ -1002,17 +1002,17 @@ def _llm_judge_novelty(self, program: Program, similar_program: Program) -> bool
                         messages=[{"role": "user", "content": user_msg}],
                     ),
                 )
-                content: str = future.result()
+                content, _reasoning = future.result()
         except RuntimeError:
             # No event loop running, safe to use asyncio.run()
-            content: str = asyncio.run(
+            content, _reasoning = asyncio.run(
                 self.novelty_llm.generate_with_context(
                     system_message=NOVELTY_SYSTEM_MSG,
                     messages=[{"role": "user", "content": user_msg}],
                 )
             )
 
-        if content is None or content is None:
+        if content is None:
             logger.warning("Novelty LLM returned empty response")
             return True
 
@@ -2519,6 +2519,7 @@ def log_prompt(
         template_key: str,
         prompt: Dict[str, str],
         responses: Optional[List[str]] = None,
+        reasonings: Optional[List[str]] = None,
     ) -> None:
         """
         Log a prompt for a program.
@@ -2538,6 +2539,10 @@ def log_prompt(
             responses = []
         prompt["responses"] = responses
 
+        # Optionally store model reasoning traces
+        if reasonings:
+            prompt["reasonings"] = reasonings
+
         if self.prompts_by_program is None:
             self.prompts_by_program = {}
 
diff --git a/openevolve/iteration.py b/openevolve/iteration.py
index b2347e006..d2b9b10fc 100644
--- a/openevolve/iteration.py
+++ b/openevolve/iteration.py
@@ -77,25 +77,25 @@ async def run_iteration_with_shared_db(
         iteration_start = time.time()
 
         # Generate code modification
-        llm_response = await llm_ensemble.generate_with_context(
+        llm_content, llm_reasoning = await llm_ensemble.generate_with_context(
            system_message=prompt["system"],
            messages=[{"role": "user", "content": prompt["user"]}],
        )
 
         # Parse the response
         if config.diff_based_evolution:
-            diff_blocks = extract_diffs(llm_response, config.diff_pattern)
+            diff_blocks = extract_diffs(llm_content, config.diff_pattern)
 
             if not diff_blocks:
                 logger.warning(f"Iteration {iteration+1}: No valid diffs found in response")
                 return None
 
             # Apply the diffs
-            child_code = apply_diff(parent.code, llm_response, config.diff_pattern)
+            child_code = apply_diff(parent.code, llm_content, config.diff_pattern)
             changes_summary = format_diff_summary(diff_blocks)
         else:
             # Parse full rewrite
-            new_code = parse_full_rewrite(llm_response, config.language)
+            new_code = parse_full_rewrite(llm_content, config.language)
 
             if not new_code:
                 logger.warning(f"Iteration {iteration+1}: No valid code found in response")
@@ -141,6 +141,7 @@ async def run_iteration_with_shared_db(
                     "system": prompt["system"],
                     "user": prompt["user"],
-                    "responses": [llm_response] if llm_response is not None else [],
+                    "responses": [llm_content] if llm_content is not None else [],
+                    "reasonings": [llm_reasoning] if llm_reasoning is not None else [],
                 }
             }
             if database.config.log_prompts
diff --git a/openevolve/llm/base.py b/openevolve/llm/base.py
index 1cbf436e9..769fbc8f1 100644
--- a/openevolve/llm/base.py
+++ b/openevolve/llm/base.py
@@ -10,13 +10,13 @@ class LLMInterface(ABC):
     """Abstract base class for LLM interfaces"""
 
     @abstractmethod
-    async def generate(self, prompt: str, **kwargs) -> str:
+    async def generate(self, prompt: str, **kwargs) -> tuple[str, Optional[str]]:
         """Generate text from a prompt"""
         pass
 
     @abstractmethod
     async def generate_with_context(
         self, system_message: str, messages: List[Dict[str, str]], **kwargs
-    ) -> str:
+    ) -> tuple[str, Optional[str]]:
         """Generate text using a system message and conversational context"""
         pass
diff --git a/openevolve/llm/ensemble.py b/openevolve/llm/ensemble.py
index e3c471673..53c576472 100644
--- a/openevolve/llm/ensemble.py
+++ b/openevolve/llm/ensemble.py
@@ -55,14 +55,14 @@ def __init__(self, models_cfg: List[LLMModelConfig]):
             )
             logger._ensemble_logged = True
 
-    async def generate(self, prompt: str, **kwargs) -> str:
+    async def generate(self, prompt: str, **kwargs) -> Tuple[str, Optional[str]]:
         """Generate text using a randomly selected model based on weights"""
         model = self._sample_model()
         return await model.generate(prompt, **kwargs)
 
     async def generate_with_context(
         self, system_message: str, messages: List[Dict[str, str]], **kwargs
-    ) -> str:
+    ) -> Tuple[str, Optional[str]]:
         """Generate text using a system message and conversational context"""
         model = self._sample_model()
         return await model.generate_with_context(system_message, messages, **kwargs)
@@ -74,21 +74,21 @@ def _sample_model(self) -> LLMInterface:
         logger.info(f"Sampled model: {vars(sampled_model)['model']}")
         return sampled_model
 
-    async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[str]:
+    async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[Tuple[str, Optional[str]]]:
         """Generate multiple texts in parallel"""
         tasks = [self.generate(prompt, **kwargs) for _ in range(n)]
         return await asyncio.gather(*tasks)
 
-    async def parallel_generate(self, prompts: List[str], **kwargs) -> List[str]:
+    async def parallel_generate(self, prompts: List[str], **kwargs) -> List[Tuple[str, Optional[str]]]:
         """Generate responses for multiple prompts in parallel"""
         tasks = [self.generate(prompt, **kwargs) for prompt in prompts]
         return await asyncio.gather(*tasks)
 
     async def generate_all_with_context(
         self, system_message: str, messages: List[Dict[str, str]], **kwargs
-    ) -> str:
-        """Generate text using a all available models and average their returned metrics"""
-        responses = []
+    ) -> List[Tuple[str, Optional[str]]]:
+        """Generate text using all available models and collect their responses"""
+        responses: List[Tuple[str, Optional[str]]] = []
         for model in self.models:
             responses.append(await model.generate_with_context(system_message, messages, **kwargs))
         return responses
diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py
index 4f86f9bb9..90a7b5b24 100644
--- a/openevolve/llm/openai.py
+++ b/openevolve/llm/openai.py
@@ -34,6 +34,8 @@ def __init__(
         self.api_key = model_cfg.api_key
         self.random_seed = getattr(model_cfg, "random_seed", None)
         self.reasoning_effort = getattr(model_cfg, "reasoning_effort", None)
+        self.enable_thinking = getattr(model_cfg, "enable_thinking", None)
+        self.thinking_budget = getattr(model_cfg, "thinking_budget", None)
 
         # Set up API client
         # OpenAI client requires max_retries to be int, not None
@@ -53,7 +55,7 @@ def __init__(
             logger.info(f"Initialized OpenAI LLM with model: {self.model}")
             logger._initialized_models.add(self.model)
 
-    async def generate(self, prompt: str, **kwargs) -> str:
+    async def generate(self, prompt: str, **kwargs) -> tuple[str, Optional[str]]:
         """Generate text from a prompt"""
         return await self.generate_with_context(
             system_message=self.system_message,
@@ -63,7 +65,7 @@ async def generate(self, prompt: str, **kwargs) -> str:
 
     async def generate_with_context(
         self, system_message: str, messages: List[Dict[str, str]], **kwargs
-    ) -> str:
+    ) -> tuple[str, Optional[str]]:
         """Generate text using a system message and conversational context"""
         # Prepare messages with system message
         formatted_messages = [{"role": "system", "content": system_message}]
@@ -120,6 +122,30 @@ async def generate_with_context(
         if reasoning_effort is not None:
             params["reasoning_effort"] = reasoning_effort
 
+        # Attach provider-specific extras such as enable_thinking/thinking_budget
+        extra_body = dict(kwargs.get("extra_body") or {})
+        enable_thinking = kwargs.get("enable_thinking", self.enable_thinking)
+        thinking_budget = kwargs.get("thinking_budget", self.thinking_budget)
+
+        if enable_thinking is not None:
+            extra_body.setdefault("enable_thinking", enable_thinking)
+            if thinking_budget is not None:
+                extra_body.setdefault("thinking_budget", thinking_budget)
+        else:
+            # Warn once per model if thinking_budget is provided without enable_thinking
+            if thinking_budget is not None:
+                if not hasattr(OpenAILLM, "_warned_budget_without_thinking"):
+                    OpenAILLM._warned_budget_without_thinking = set()
+                if self.model not in OpenAILLM._warned_budget_without_thinking:
+                    logger.warning(
+                        "thinking_budget set for model %s without enable_thinking; ignoring thinking_budget",
+                        self.model,
+                    )
+                    OpenAILLM._warned_budget_without_thinking.add(self.model)
+
+        if extra_body:
+            params["extra_body"] = extra_body
+
         # Add seed parameter for reproducibility if configured
         # Skip seed parameter for Google AI Studio endpoint as it doesn't support it
         seed = kwargs.get("seed", self.random_seed)
@@ -139,8 +165,8 @@ async def generate_with_context(
         for attempt in range(retries + 1):
             try:
-                response = await asyncio.wait_for(self._call_api(params), timeout=timeout)
-                return response
+                content, reasoning_content = await asyncio.wait_for(self._call_api(params), timeout=timeout)
+                return content, reasoning_content
             except asyncio.TimeoutError:
                 if attempt < retries:
                     logger.warning(f"Timeout on attempt {attempt + 1}/{retries + 1}. Retrying...")
@@ -158,15 +184,29 @@ async def generate_with_context(
                     logger.error(f"All {retries + 1} attempts failed with error: {str(e)}")
                     raise
 
-    async def _call_api(self, params: Dict[str, Any]) -> str:
+        # Safety net to satisfy type checkers; the loop above always returns or raises
+        raise RuntimeError("Failed to generate completion after retries")
+
+    async def _call_api(self, params: Dict[str, Any]) -> tuple[str, Optional[str]]:
         """Make the actual API call"""
         # Use asyncio to run the blocking API call in a thread pool
         loop = asyncio.get_event_loop()
         response = await loop.run_in_executor(
             None, lambda: self.client.chat.completions.create(**params)
         )
+        message = response.choices[0].message
+
+        content = message.content
         # Logging of system prompt, user message and response content
         logger = logging.getLogger(__name__)
         logger.debug(f"API parameters: {params}")
-        logger.debug(f"API response: {response.choices[0].message.content}")
-        return response.choices[0].message.content
+        logger.debug(f"API response: {content}")
+
+        # Extract reasoning content if available
+        reasoning_content = None
+        if hasattr(message, "reasoning_content"):
+            logger.debug(f"API reasoning content: {message.reasoning_content}")
+            reasoning_content = message.reasoning_content
+
+        return content, reasoning_content
diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py
index 59a4a6b68..eaa12e762 100644
--- a/openevolve/process_parallel.py
+++ b/openevolve/process_parallel.py
@@ -30,6 +30,7 @@ class SerializableResult:
     iteration_time: float = 0.0
     prompt: Optional[Dict[str, str]] = None
     llm_response: Optional[str] = None
+    llm_reasoning: Optional[str] = None
     artifacts: Optional[Dict[str, Any]] = None
     iteration: int = 0
     error: Optional[str] = None
@@ -186,7 +187,7 @@ def _run_iteration_worker(
     # Generate code modification (sync wrapper for async)
     try:
-        llm_response = asyncio.run(
+        llm_content, llm_reasoning = asyncio.run(
             _worker_llm_ensemble.generate_with_context(
                 system_message=prompt["system"],
                 messages=[{"role": "user", "content": prompt["user"]}],
@@ -197,25 +198,25 @@ def _run_iteration_worker(
         return SerializableResult(error=f"LLM generation failed: {str(e)}", iteration=iteration)
 
     # Check for None response
-    if llm_response is None:
+    if llm_content is None:
        return SerializableResult(error="LLM returned None response", iteration=iteration)
 
     # Parse response based on evolution mode
     if _worker_config.diff_based_evolution:
         from openevolve.utils.code_utils import apply_diff, extract_diffs, format_diff_summary
 
-        diff_blocks = extract_diffs(llm_response, _worker_config.diff_pattern)
+        diff_blocks = extract_diffs(llm_content, _worker_config.diff_pattern)
 
         if not diff_blocks:
             return SerializableResult(
                 error=f"No valid diffs found in response", iteration=iteration
             )
 
-        child_code = apply_diff(parent.code, llm_response, _worker_config.diff_pattern)
+        child_code = apply_diff(parent.code, llm_content, _worker_config.diff_pattern)
         changes_summary = format_diff_summary(diff_blocks)
     else:
         from openevolve.utils.code_utils import parse_full_rewrite
 
-        new_code = parse_full_rewrite(llm_response, _worker_config.language)
+        new_code = parse_full_rewrite(llm_content, _worker_config.language)
 
         if not new_code:
             return SerializableResult(
                 error=f"No valid code found in response", iteration=iteration
@@ -258,12 +259,17 @@ def _run_iteration_worker(
 
     iteration_time = time.time() - iteration_start
 
+    # Propagate the response to the controller for logging/persistence;
+    # llm_reasoning is already set from the generation step above
+    llm_response = llm_content
+
     return SerializableResult(
         child_program_dict=child_program.to_dict(),
         parent_id=parent.id,
         iteration_time=iteration_time,
         prompt=prompt,
         llm_response=llm_response,
+        llm_reasoning=llm_reasoning,
         artifacts=artifacts,
         iteration=iteration,
     )
@@ -534,6 +540,9 @@ async def run_evolution(
                             program_id=child_program.id,
                             prompt=result.prompt,
                             responses=[result.llm_response] if result.llm_response else [],
+                            reasonings=[result.llm_reasoning]
+                            if getattr(result, "llm_reasoning", None)
+                            else None,
                         )
 
                         # Island management
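
With this change, `generate`, `generate_with_context`, and the ensemble helpers return a `(content, reasoning)` tuple instead of a bare string, so callers outside this patch must unpack the result. A minimal sketch under that assumption (hypothetical caller, not part of the diff; `llm` is any `LLMInterface` implementation, and `reasoning` is `None` whenever the provider returns no `reasoning_content`):

```python
# Hypothetical caller: unpack the new (content, reasoning) tuple.
content, reasoning = await llm.generate("Improve this function")
print(content)
if reasoning is not None:
    # Reasoning traces can be large; log or persist them as needed.
    print(f"--- reasoning trace ---\n{reasoning}")
```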