Skip to content

Commit ca1c88b

Browse files
committed
feat: add agentic scraper integration
1 parent 104faab commit ca1c88b

File tree

5 files changed

+518
-1
lines changed

5 files changed

+518
-1
lines changed

examples/agentic_scraper_tool.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
import json

from scrapegraph_py.logger import sgai_logger

from langchain_scrapegraph.tools import AgenticScraperTool

sgai_logger.set_logging(level="INFO")

# Will automatically get SGAI_API_KEY from environment
tool = AgenticScraperTool()

# Divider printed between examples (same output as "\n" + "=" * 50 + "\n").
SEPARATOR = "\n" + "=" * 50 + "\n"


def _run_example(title, payload):
    """Print *title*, invoke the shared tool with *payload*, and dump the JSON result.

    Failures are caught and reported so one broken example does not abort the rest.
    """
    print(title)
    try:
        print(json.dumps(tool.invoke(payload), indent=2))
    except Exception as exc:
        print(f"Error: {exc}")


# Example 1: Basic usage with form filling and navigation
_run_example(
    "=== Example 1: Basic Form Filling ===",
    {
        "url": "https://example.com/login",
        "steps": [
            "Type '[email protected]' in email input box",
            "Type 'password123' in password input box",
            "Click on login button",
        ],
        "use_session": True,
    },
)

print(SEPARATOR)

# Example 2: With AI extraction and structured output
_run_example(
    "=== Example 2: AI Extraction with Schema ===",
    {
        "url": "https://dashboard.example.com",
        "steps": [
            "Navigate to user profile section",
            "Click on settings tab",
            "Wait for page to load",
        ],
        "ai_extraction": True,
        "user_prompt": "Extract user profile information and available dashboard sections and settings",
        # JSON-style schema describing the structured data expected back.
        "output_schema": {
            "user_info": {
                "type": "object",
                "properties": {
                    "username": {"type": "string"},
                    "email": {"type": "string"},
                    "dashboard_sections": {"type": "array", "items": {"type": "string"}},
                    "available_settings": {"type": "array", "items": {"type": "string"}},
                },
            }
        },
        "use_session": True,
    },
)

print(SEPARATOR)

# Example 3: E-commerce product search
_run_example(
    "=== Example 3: E-commerce Product Search ===",
    {
        "url": "https://shop.example.com",
        "steps": [
            "Type 'laptop' in search input box",
            "Click on search button",
            "Wait for results to load",
            "Click on first product",
        ],
        "ai_extraction": True,
        "user_prompt": "Extract product information including name, price, description, and availability",
        "output_schema": {
            "product_info": {
                "type": "object",
                "properties": {
                    "product_name": {"type": "string"},
                    "price": {"type": "string"},
                    "description": {"type": "string"},
                    "availability": {"type": "string"},
                },
            }
        },
        "use_session": True,
    },
)
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
import json

from pydantic import BaseModel, Field
from scrapegraph_py.logger import sgai_logger

from langchain_scrapegraph.tools import AgenticScraperTool

sgai_logger.set_logging(level="INFO")


# --- Output schemas for the different use cases ---
# NOTE: plain # comments (not class docstrings) on purpose — a BaseModel
# docstring would be picked up as the model's schema description.

class UserProfileInfo(BaseModel):
    # Structured view of a user's dashboard profile.
    username: str = Field(description="The user's username")
    email: str = Field(description="The user's email address")
    dashboard_sections: list[str] = Field(description="Available dashboard sections")
    available_settings: list[str] = Field(description="Available user settings")


class ProductInfo(BaseModel):
    # Structured view of a single e-commerce product page.
    product_name: str = Field(description="The name of the product")
    price: str = Field(description="The price of the product")
    description: str = Field(description="Product description")
    availability: str = Field(description="Product availability status")
    rating: float = Field(description="Product rating out of 5")


class LoginResult(BaseModel):
    # Outcome of an automated login attempt; optional fields default to "".
    success: bool = Field(description="Whether login was successful")
    error_message: str = Field(default="", description="Error message if login failed")
    redirect_url: str = Field(default="", description="URL to redirect to after login")


# Divider printed between examples (same output as "\n" + "=" * 50 + "\n").
SEPARATOR = "\n" + "=" * 50 + "\n"


def _invoke_and_report(scraper, payload, result_label):
    """Invoke *scraper* with *payload*; on success print *result_label* and the
    JSON result, otherwise print the error so later examples still run."""
    try:
        outcome = scraper.invoke(payload)
        print(result_label)
        print(json.dumps(outcome, indent=2))
    except Exception as exc:
        print(f"Error: {exc}")


# Example 1: user-profile extraction, structured via a Pydantic schema.
print("=== Example 1: User Profile Extraction with Schema ===")
_invoke_and_report(
    AgenticScraperTool(llm_output_schema=UserProfileInfo),
    {
        "url": "https://dashboard.example.com",
        "steps": [
            "Navigate to user profile section",
            "Click on settings tab",
            "Wait for page to load",
        ],
        "ai_extraction": True,
        "user_prompt": "Extract user profile information and available dashboard sections and settings",
        "use_session": True,
    },
    "User Profile Result:",
)

print(SEPARATOR)

# Example 2: product-page extraction, structured via a Pydantic schema.
print("=== Example 2: Product Information Extraction with Schema ===")
_invoke_and_report(
    AgenticScraperTool(llm_output_schema=ProductInfo),
    {
        "url": "https://shop.example.com",
        "steps": [
            "Type 'laptop' in search input box",
            "Click on search button",
            "Wait for results to load",
            "Click on first product",
        ],
        "ai_extraction": True,
        "user_prompt": "Extract product information including name, price, description, availability, and rating",
        "use_session": True,
    },
    "Product Info Result:",
)

print(SEPARATOR)

# Example 3: login flow, outcome structured via a Pydantic schema.
print("=== Example 3: Login Process with Schema ===")
_invoke_and_report(
    AgenticScraperTool(llm_output_schema=LoginResult),
    {
        "url": "https://example.com/login",
        "steps": [
            "Type '[email protected]' in email input box",
            "Type 'password123' in password input box",
            "Click on login button",
            "Wait for response",
        ],
        "ai_extraction": True,
        "user_prompt": "Determine if login was successful and extract any error messages or redirect URLs",
        "use_session": True,
    },
    "Login Result:",
)

print(SEPARATOR)

# Example 4: Using dictionary schema instead of Pydantic model
print("=== Example 4: Dictionary Schema ===")
_invoke_and_report(
    AgenticScraperTool(),
    {
        "url": "https://news.example.com",
        "steps": [
            "Navigate to latest news section",
            "Click on first article",
            "Wait for page to load",
        ],
        "ai_extraction": True,
        "user_prompt": "Extract article headline, author, publish date, content summary, and tags",
        # Plain-dict JSON schema passed per-call instead of llm_output_schema.
        "output_schema": {
            "news_article": {
                "type": "object",
                "properties": {
                    "headline": {"type": "string"},
                    "author": {"type": "string"},
                    "publish_date": {"type": "string"},
                    "content_summary": {"type": "string"},
                    "tags": {"type": "array", "items": {"type": "string"}},
                },
            }
        },
        "use_session": True,
    },
    "News Article Result:",
)

langchain_scrapegraph/tools/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
from .agentic_scraper import AgenticScraperTool
12
from .credits import GetCreditsTool
23
from .markdownify import MarkdownifyTool
34
from .searchscraper import SearchScraperTool
45
from .smartcrawler import SmartCrawlerTool
56
from .smartscraper import SmartScraperTool
67

78
__all__ = [
9+
"AgenticScraperTool",
810
"SmartScraperTool",
911
"SmartCrawlerTool",
1012
"GetCreditsTool",

0 commit comments

Comments
 (0)