Skip to content

Commit 34b5f10

Browse files
committed
feat: added pydantic output schema 🔍
1 parent 03e49dc commit 34b5f10

File tree

5 files changed

+219
-16
lines changed

5 files changed

+219
-16
lines changed

README.md

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,39 @@ result = tool.invoke({
4444
print(result)
4545
```
4646

47+
<details>
48+
<summary>🔍 Using Output Schemas with SmartScraperTool</summary>
49+
50+
You can define the structure of the output using Pydantic models:
51+
52+
```python
53+
from typing import List
54+
from pydantic import BaseModel, Field
55+
from langchain_scrapegraph.tools import SmartScraperTool
56+
57+
class WebsiteInfo(BaseModel):
58+
title: str = Field(description="The main title of the webpage")
59+
description: str = Field(description="The main description or first paragraph")
60+
urls: List[str] = Field(description="The URLs inside the webpage")
61+
62+
# Initialize with schema
63+
tool = SmartScraperTool(llm_output_schema=WebsiteInfo)
64+
65+
# The output will conform to the WebsiteInfo schema
66+
result = tool.invoke({
67+
"website_url": "https://www.example.com",
68+
"user_prompt": "Extract the website information"
69+
})
70+
71+
print(result)
72+
# {
73+
# "title": "Example Domain",
74+
# "description": "This domain is for use in illustrative examples...",
75+
# "urls": ["https://www.iana.org/domains/example"]
76+
# }
77+
```
78+
</details>
79+
4780
### 💻 LocalscraperTool
4881
Extract information from HTML content using AI.
4982

@@ -59,6 +92,54 @@ result = tool.invoke({
5992
print(result)
6093
```
6194

95+
<details>
96+
<summary>🔍 Using Output Schemas with LocalScraperTool</summary>
97+
98+
You can define the structure of the output using Pydantic models:
99+
100+
```python
101+
from typing import Optional
102+
from pydantic import BaseModel, Field
103+
from langchain_scrapegraph.tools import LocalScraperTool
104+
105+
class CompanyInfo(BaseModel):
106+
name: str = Field(description="The company name")
107+
description: str = Field(description="The company description")
108+
email: Optional[str] = Field(description="Contact email if available")
109+
phone: Optional[str] = Field(description="Contact phone if available")
110+
111+
# Initialize with schema
112+
tool = LocalScraperTool(llm_output_schema=CompanyInfo)
113+
114+
html_content = """
115+
<html>
116+
<body>
117+
<h1>TechCorp Solutions</h1>
118+
<p>We are a leading AI technology company.</p>
119+
<div class="contact">
120+
<p>Email: [email protected]</p>
121+
<p>Phone: (555) 123-4567</p>
122+
</div>
123+
</body>
124+
</html>
125+
"""
126+
127+
# The output will conform to the CompanyInfo schema
128+
result = tool.invoke({
129+
"website_html": html_content,
130+
"user_prompt": "Extract the company information"
131+
})
132+
133+
print(result)
134+
# {
135+
# "name": "TechCorp Solutions",
136+
# "description": "We are a leading AI technology company.",
137+
# "email": "[email protected]",
138+
# "phone": "(555) 123-4567"
139+
# }
140+
```
141+
</details>
142+
62143
## 🌟 Key Features
63144

64145
- 🐦 **LangChain Integration**: Seamlessly works with LangChain agents and chains
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from typing import List
2+
3+
from pydantic import BaseModel, Field
4+
from scrapegraph_py.logger import sgai_logger
5+
6+
from langchain_scrapegraph.tools import LocalScraperTool
7+
8+
9+
class WebsiteInfo(BaseModel):
10+
title: str = Field(description="The main title of the webpage")
11+
description: str = Field(description="The main description or first paragraph")
12+
urls: List[str] = Field(description="The URLs inside the webpage")
13+
14+
15+
sgai_logger.set_logging(level="INFO")
16+
17+
# Initialize with Pydantic model class
18+
tool = LocalScraperTool(llm_output_schema=WebsiteInfo)
19+
20+
# Example website and prompt
21+
html_content = """
22+
<html>
23+
<body>
24+
<h1>Company Name</h1>
25+
<p>We are a technology company focused on AI solutions.</p>
26+
<div class="contact">
27+
<p>Email: [email protected]</p>
28+
<p>Phone: (555) 123-4567</p>
29+
</div>
30+
</body>
31+
</html>
32+
"""
33+
user_prompt = "Make a summary of the webpage and extract the email and phone number"
34+
35+
# Use the tool
36+
result = tool.invoke({"website_html": html_content, "user_prompt": user_prompt})
37+
38+
print(result)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from typing import List
2+
3+
from pydantic import BaseModel, Field
4+
from scrapegraph_py.logger import sgai_logger
5+
6+
from langchain_scrapegraph.tools import SmartScraperTool
7+
8+
9+
class WebsiteInfo(BaseModel):
10+
title: str = Field(description="The main title of the webpage")
11+
description: str = Field(description="The main description or first paragraph")
12+
urls: List[str] = Field(description="The URLs inside the webpage")
13+
14+
15+
sgai_logger.set_logging(level="INFO")
16+
17+
# Initialize with Pydantic model class
18+
tool = SmartScraperTool(llm_output_schema=WebsiteInfo)
19+
20+
# Example website and prompt
21+
website_url = "https://www.example.com"
22+
user_prompt = "Extract info about the website"
23+
24+
# Use the tool - output will conform to WebsiteInfo schema
25+
result = tool.invoke({"website_url": website_url, "user_prompt": user_prompt})
26+
print(result)

langchain_scrapegraph/tools/localscraper.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class LocalScraperTool(BaseTool):
3737
Key init args:
3838
api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
3939
client: Optional pre-configured ScrapeGraph client instance.
40+
llm_output_schema: Optional Pydantic model class to structure the output.
41+
If provided, the tool will ensure the output conforms to this schema.
4042
4143
Instantiate:
4244
.. code-block:: python
@@ -49,6 +51,16 @@ class LocalScraperTool(BaseTool):
4951
# Or provide API key directly
5052
tool = LocalScraperTool(api_key="your-api-key")
5153
54+
# Optionally, you can provide an output schema:
55+
from pydantic import BaseModel, Field
56+
57+
class CompanyInfo(BaseModel):
58+
name: str = Field(description="Company name")
59+
description: str = Field(description="Company description")
60+
email: str = Field(description="Contact email")
61+
62+
tool_with_schema = LocalScraperTool(llm_output_schema=CompanyInfo)
63+
5264
Use the tool:
5365
.. code-block:: python
5466
@@ -71,21 +83,21 @@ class LocalScraperTool(BaseTool):
7183
})
7284
7385
print(result)
86+
# Without schema:
7487
# {
7588
# "description": "We are a technology company focused on AI solutions",
7689
# "contact": {
7790
# "email": "[email protected]",
7891
# "phone": "(555) 123-4567"
7992
# }
8093
# }
81-
82-
Async usage:
83-
.. code-block:: python
84-
85-
result = await tool.ainvoke({
86-
"user_prompt": "Extract contact information",
87-
"website_html": html_content
88-
})
94+
#
95+
# With CompanyInfo schema:
96+
# {
97+
# "name": "Company Name",
98+
# "description": "We are a technology company focused on AI solutions",
99+
# "email": "[email protected]"
100+
# }
89101
"""
90102

91103
name: str = "LocalScraper"
@@ -96,6 +108,7 @@ class LocalScraperTool(BaseTool):
96108
return_direct: bool = True
97109
client: Optional[Client] = None
98110
api_key: str
111+
llm_output_schema: Optional[Type[BaseModel]] = None
99112

100113
@model_validator(mode="before")
101114
@classmethod
@@ -117,10 +130,23 @@ def _run(
117130
"""Use the tool to extract data from a website."""
118131
if not self.client:
119132
raise ValueError("Client not initialized")
120-
response = self.client.localscraper(
121-
website_html=website_html,
122-
user_prompt=user_prompt,
123-
)
133+
134+
if self.llm_output_schema is None:
135+
response = self.client.localscraper(
136+
website_html=website_html,
137+
user_prompt=user_prompt,
138+
)
139+
elif isinstance(self.llm_output_schema, type) and issubclass(
140+
self.llm_output_schema, BaseModel
141+
):
142+
response = self.client.localscraper(
143+
website_html=website_html,
144+
user_prompt=user_prompt,
145+
output_schema=self.llm_output_schema,
146+
)
147+
else:
148+
raise ValueError("llm_output_schema must be a Pydantic model class")
149+
124150
return response["result"]
125151

126152
async def _arun(

langchain_scrapegraph/tools/smartscraper.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class SmartScraperTool(BaseTool):
3737
Key init args:
3838
api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
3939
client: Optional pre-configured ScrapeGraph client instance.
40+
llm_output_schema: Optional Pydantic model class to structure the output.
41+
If provided, the tool will ensure the output conforms to this schema.
4042
4143
Instantiate:
4244
.. code-block:: python
@@ -49,6 +51,15 @@ class SmartScraperTool(BaseTool):
4951
# Or provide API key directly
5052
tool = SmartScraperTool(api_key="your-api-key")
5153
54+
# Optionally, you can provide an output schema:
55+
from pydantic import BaseModel, Field
56+
57+
class WebsiteInfo(BaseModel):
58+
title: str = Field(description="The main title")
59+
description: str = Field(description="The main description")
60+
61+
tool_with_schema = SmartScraperTool(llm_output_schema=WebsiteInfo)
62+
5263
Use the tool:
5364
.. code-block:: python
5465
@@ -58,10 +69,17 @@ class SmartScraperTool(BaseTool):
5869
})
5970
6071
print(result)
72+
# Without schema:
6173
# {
6274
# "main_heading": "Example Domain",
6375
# "first_paragraph": "This domain is for use in illustrative examples..."
6476
# }
77+
#
78+
# With WebsiteInfo schema:
79+
# {
80+
# "title": "Example Domain",
81+
# "description": "This domain is for use in illustrative examples..."
82+
# }
6583
6684
Async usage:
6785
.. code-block:: python
@@ -80,6 +98,7 @@ class SmartScraperTool(BaseTool):
8098
return_direct: bool = True
8199
client: Optional[Client] = None
82100
api_key: str
101+
llm_output_schema: Optional[Type[BaseModel]] = None
83102

84103
@model_validator(mode="before")
85104
@classmethod
@@ -101,10 +120,23 @@ def _run(
101120
"""Use the tool to extract data from a website."""
102121
if not self.client:
103122
raise ValueError("Client not initialized")
104-
response = self.client.smartscraper(
105-
website_url=website_url,
106-
user_prompt=user_prompt,
107-
)
123+
124+
if self.llm_output_schema is None:
125+
response = self.client.smartscraper(
126+
website_url=website_url,
127+
user_prompt=user_prompt,
128+
)
129+
elif isinstance(self.llm_output_schema, type) and issubclass(
130+
self.llm_output_schema, BaseModel
131+
):
132+
response = self.client.smartscraper(
133+
website_url=website_url,
134+
user_prompt=user_prompt,
135+
output_schema=self.llm_output_schema,
136+
)
137+
else:
138+
raise ValueError("llm_output_schema must be a Pydantic model class")
139+
108140
return response["result"]
109141

110142
async def _arun(

0 commit comments

Comments
 (0)