Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 6 additions & 27 deletions camel/societies/workforce/workforce.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,35 +117,14 @@ def __init__(

# Warning messages for default model usage
if coordinator_agent_kwargs is None:
logger.warning(
"No coordinator_agent_kwargs provided. Using default "
"ChatAgent settings (ModelPlatformType.DEFAULT, "
"ModelType.DEFAULT). To customize the coordinator agent "
"that assigns tasks and handles failures, pass a dictionary "
"with ChatAgent parameters, e.g.: {'model': your_model, "
"'tools': your_tools, 'token_limit': 8000}. See ChatAgent "
"documentation for all available options."
)
logger.info("Using default coordinator agent settings. " \
"Pass coordinator_agent_kwargs to customize.")
if task_agent_kwargs is None:
logger.warning(
"No task_agent_kwargs provided. Using default ChatAgent "
"settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT). "
"To customize the task planning agent that "
"decomposes/composes tasks, pass a dictionary with "
"ChatAgent parameters, e.g.: {'model': your_model, "
"'token_limit': 16000}. See ChatAgent documentation for "
"all available options."
)
logger.info("Using default task agent settings. " \
"Pass task_agent_kwargs to customize.")
if new_worker_agent_kwargs is None:
logger.warning(
"No new_worker_agent_kwargs provided. Workers created at "
"runtime will use default ChatAgent settings with "
"SearchToolkit, CodeExecutionToolkit, and ThinkingToolkit. "
"To customize runtime worker creation, pass a dictionary "
"with ChatAgent parameters, e.g.: {'model': your_model, "
"'tools': your_tools}. See ChatAgent documentation for all "
"available options."
)
logger.info("Using default worker settings with SearchToolkit, CodeExecutionToolkit, and ThinkingToolkit. " \
"Pass new_worker_agent_kwargs to customize.")

coord_agent_sys_msg = BaseMessage.make_assistant_message(
role_name="Workforce Manager",
Expand Down
172 changes: 164 additions & 8 deletions camel/utils/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def deduplicate_internally(

strategy is used to specify different strategies, where 'top1' selects the
one with highest similarity, and 'llm-supervise' uses LLM to determine if
texts are duplicates (not yet implemented).
texts are duplicates.

Args:
texts (List[str]): The list of texts to be deduplicated.
Expand Down Expand Up @@ -144,17 +144,11 @@ def deduplicate_internally(
unique_embeddings_dict={
0: embeddings[0]
if embeddings
else embedding_instance.embed_list(texts)[0] # type: ignore[union-attr]
else embedding_instance.embed_list(texts)[0] if embedding_instance else [0.0] # type: ignore[union-attr]
},
duplicate_to_target_map={},
)

if strategy == "llm-supervise":
# TODO: Implement LLM-supervise deduplication.
raise NotImplementedError(
"LLM-supervise deduplication is not yet implemented."
)

# Check if the parameters are valid.
if not 0 <= threshold <= 1:
raise ValueError("Threshold must be between 0 and 1")
Expand All @@ -169,6 +163,15 @@ def deduplicate_internally(
"Please choose only one way to supply embeddings."
)

if strategy == "llm-supervise":
return _deduplicate_with_llm_supervision(
texts=texts,
threshold=threshold,
embedding_instance=embedding_instance,
embeddings=embeddings,
batch_size=batch_size,
)

if embedding_instance is not None:
# Use Camel's embedding_instance to vectorize.
embeddings = embedding_instance.embed_list(texts)
Expand Down Expand Up @@ -230,3 +233,156 @@ def deduplicate_internally(
unique_embeddings_dict=unique_embeddings_dict,
duplicate_to_target_map=duplicate_to_target_map,
)


def _deduplicate_with_llm_supervision(
texts: List[str],
threshold: float,
embedding_instance: Optional[BaseEmbedding[str]],
embeddings: Optional[List[List[float]]],
batch_size: int,
) -> DeduplicationResult:
r"""Deduplicate texts using LLM supervision.

This function uses embeddings to find potential duplicates, then uses an LLM
to determine if they are actually duplicates.

Args:
texts: List of texts to deduplicate
threshold: Similarity threshold for initial filtering
embedding_instance: Embedding instance for computing embeddings
embeddings: Pre-computed embeddings
batch_size: Batch size for processing

Returns:
DeduplicationResult with LLM-supervised deduplication results
"""
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# First, get embeddings if not provided
if embedding_instance is not None:
embeddings = embedding_instance.embed_list(texts)
elif embeddings is None:
raise ValueError(
"Either 'embedding_instance' or 'embeddings' must be provided."
)

if len(embeddings) != len(texts):
raise ValueError(
"The length of 'embeddings' does not match the length of 'texts'."
)

# Convert embeddings to numpy array
embeddings_array = np.array(embeddings)
n = len(texts)
duplicate_to_target_map: Dict[int, int] = {}

# Find potential duplicates using cosine similarity
potential_duplicates = []

for i in range(0, n, batch_size):
batch_end = min(i + batch_size, n)
batch_similarities = cosine_similarity(
embeddings_array[i:batch_end], embeddings_array[:batch_end]
)

# Create mask for lower triangle
tril_mask = np.tril(np.ones_like(batch_similarities), k=-1)
batch_similarities = batch_similarities * tril_mask

# Find pairs above threshold
for j in range(batch_end - i):
for k in range(j):
if batch_similarities[j, k] > threshold:
potential_duplicates.append((i + j, k))

# Use LLM to determine actual duplicates
if potential_duplicates:
duplicate_pairs = _llm_judge_duplicates(texts, potential_duplicates)

# Build duplicate map
for duplicate_idx, target_idx in duplicate_pairs:
duplicate_to_target_map[duplicate_idx] = target_idx

# Get unique ids and embeddings
unique_ids = []
unique_embeddings_dict = {}

for i, (_, emb) in enumerate(zip(texts, embeddings)):
if i not in duplicate_to_target_map:
unique_ids.append(i)
unique_embeddings_dict[i] = emb

return DeduplicationResult(
original_texts=texts,
unique_ids=unique_ids,
unique_embeddings_dict=unique_embeddings_dict,
duplicate_to_target_map=duplicate_to_target_map,
)


def _llm_judge_duplicates(
texts: List[str],
potential_duplicates: List[tuple[int, int]]
) -> List[tuple[int, int]]:
r"""Use LLM to judge if potential duplicate pairs are actually duplicates.

Args:
texts: List of all texts
potential_duplicates: List of (duplicate_idx, target_idx) pairs

Returns:
List of (duplicate_idx, target_idx) pairs that LLM judged as duplicates
"""
try:
# Import here to avoid circular import issues
from camel.models import ModelFactory
from camel.types import ModelPlatformType
from camel.types.enums import ModelType

# Create a simple LLM model for judgment
# Using a lightweight model for efficiency
llm_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_3_5_TURBO,
)

actual_duplicates = []

for duplicate_idx, target_idx in potential_duplicates:
text1 = texts[duplicate_idx]
text2 = texts[target_idx]

# Create prompt for LLM judgment
prompt = f"""You are a text deduplication expert. Your task is to determine if two texts are duplicates or near-duplicates.

Text 1: "{text1}"
Text 2: "{text2}"

Are these texts duplicates or near-duplicates? Consider:
- Semantic similarity
- Information overlap
- Whether they convey the same meaning

Respond with only "YES" if they are duplicates, or "NO" if they are not duplicates."""

try:
# Get LLM response
response = llm_model.run([{"role": "user", "content": prompt}])
judgment = response.choices[0].message.content.strip().upper()

if judgment == "YES":
actual_duplicates.append((duplicate_idx, target_idx))

except Exception as e:
# If LLM fails, default to keeping both texts (no deduplication)
print(f"LLM judgment failed for pair {duplicate_idx}-{target_idx}: {e}")
continue

return actual_duplicates

except Exception as e:
print(f"Failed to initialize LLM for deduplication: {e}")
# Fallback: return empty list (no deduplication)
return []
Loading