Commit e4cc8fd

feat: add all the integrations

1 parent 844d307

8 files changed, +539 −18 lines.

.pre-commit-config.yaml

Lines changed: 26 additions & 4 deletions

```diff
@@ -1,23 +1,45 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 24.8.0
+    rev: 24.3.0 # Updated to latest stable
     hooks:
       - id: black
+        language_version: python3.10
 
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.3.4 # Updated to latest stable
     hooks:
       - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
 
   - repo: https://github.com/pycqa/isort
-    rev: 5.13.2
+    rev: 5.13.2 # Latest stable
     hooks:
       - id: isort
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v4.5.0 # Updated to latest stable
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
       - id: check-yaml
         exclude: mkdocs.yml
+      - id: check-added-large-files
+      - id: check-case-conflict
+      - id: check-merge-conflict
+      - id: detect-private-key
+      - id: debug-statements
+
+  - repo: https://github.com/python-poetry/poetry
+    rev: 1.8.0 # Latest stable
+    hooks:
+      - id: poetry-check
+      - id: poetry-lock
+        args: ["--check"]
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.9.0 # Latest stable
+    hooks:
+      - id: mypy
+        additional_dependencies: [types-setuptools]
+        args: [--ignore-missing-imports, --follow-imports=skip]
+        exclude: ^(tests/|examples/)
```
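The new mirrors-mypy hook type-checks library code while skipping `tests/` and `examples/`. A minimal sketch (hypothetical snippet, not part of this commit) of the kind of defect it will flag:

```python
# Hypothetical snippet, not part of this commit: a defect the new mypy hook
# catches in library code (tests/ and examples/ are excluded by the config).
def char_count(html: str) -> int:
    """Return the number of characters in an HTML document."""
    return len(html)


# mypy: error: Argument 1 to "char_count" has incompatible type "int"; expected "str"
char_count(42)
```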

README.md

Lines changed: 35 additions & 0 deletions

````diff
@@ -22,6 +22,41 @@ We offer SDKs in both Python and Node.js, making it easy to integrate into your
 pip install langchain-scrapegraph
 ```
 
+## 🔄 Development Setup
+
+### Pre-commit Hooks
+
+We use pre-commit hooks to ensure code quality and consistency. Install the pre-commit hooks with:
+
+```bash
+pip install pre-commit
+pre-commit install
+```
+
+The following hooks are configured:
+
+- **black**: Code formatting
+- **ruff**: Fast Python linter
+- **isort**: Import sorting
+- **pre-commit-hooks**: Various checks including:
+  - trailing whitespace
+  - end of files
+  - yaml validation
+  - large files
+  - merge conflicts
+  - private keys
+  - debug statements
+- **poetry**: Package management checks
+  - poetry-check: Validate pyproject.toml
+  - poetry-lock: Ensure poetry.lock is up to date
+- **mypy**: Static type checking
+
+Run the hooks manually with:
+
+```bash
+pre-commit run --all-files
+```
+
 ## 🛠️ Available Tools
 
 ### 📝 MarkdownifyTool
````
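For CI jobs that don't go through the installed git hook, the same checks can be driven from a script. A minimal sketch (hypothetical helper, not part of this commit) wrapping the `pre-commit run --all-files` command the README documents:

```python
# Hypothetical CI helper, not part of this commit: invoke the documented
# pre-commit command and propagate its exit status.
import subprocess
import sys

result = subprocess.run(["pre-commit", "run", "--all-files"])
# pre-commit exits non-zero when any hook fails or rewrites a file.
sys.exit(result.returncode)
```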

examples/scrape_example.py

Lines changed: 219 additions & 0 deletions

```diff
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+Basic synchronous example demonstrating how to use the Scrape API.
+
+This example shows:
+1. How to make a basic scrape request
+2. How to use render_heavy_js for JavaScript-heavy websites
+3. How to add custom headers
+4. How to handle the response
+
+Equivalent curl command:
+curl -X POST https://api.scrapegraphai.com/v1/scrape \
+  -H "Content-Type: application/json" \
+  -H "SGAI-APIKEY: your-api-key-here" \
+  -d '{
+    "website_url": "https://example.com",
+    "render_heavy_js": false
+  }'
+
+Requirements:
+- Python 3.7+
+- scrapegraph-py
+- python-dotenv
+- A .env file with your SGAI_API_KEY
+
+Example .env file:
+SGAI_API_KEY=your_api_key_here
+"""
+
+import time
+from pathlib import Path
+
+from dotenv import load_dotenv
+from scrapegraph_py import Client
+
+# Load environment variables from .env file
+load_dotenv()
+
+
+def basic_scrape_example():
+    """Demonstrate basic scrape functionality."""
+    print("🌐 Basic Scrape Example")
+    print("=" * 30)
+
+    # Initialize client
+    client = Client.from_env()
+
+    try:
+        # Basic scrape request
+        print("Making basic scrape request...")
+        result = client.scrape(website_url="https://example.com", render_heavy_js=False)
+
+        # Display results
+        html_content = result.get("html", "")
+        print(f"✅ Success! Received {len(html_content):,} characters of HTML")
+        print(f"Request ID: {result.get('request_id', 'N/A')}")
+
+        return result
+
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        return None
+    finally:
+        client.close()
+
+
+def scrape_with_heavy_js():
+    """Demonstrate scraping with heavy JavaScript rendering."""
+    print("\n🚀 Heavy JavaScript Rendering Example")
+    print("=" * 45)
+
+    client = Client.from_env()
+
+    try:
+        print("Making scrape request with heavy JS rendering...")
+        start_time = time.time()
+
+        result = client.scrape(
+            website_url="https://example.com",
+            render_heavy_js=True,  # Enable JavaScript rendering
+        )
+
+        execution_time = time.time() - start_time
+        html_content = result.get("html", "")
+
+        print(f"✅ Success! Received {len(html_content):,} characters of HTML")
+        print(f"⏱️ Execution time: {execution_time:.2f} seconds")
+        print(f"Request ID: {result.get('request_id', 'N/A')}")
+
+        return result
+
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        return None
+    finally:
+        client.close()
+
+
+def scrape_with_custom_headers():
+    """Demonstrate scraping with custom headers."""
+    print("\n🔧 Custom Headers Example")
+    print("=" * 30)
+
+    client = Client.from_env()
+
+    # Custom headers for better compatibility
+    custom_headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }
+
+    try:
+        print("Making scrape request with custom headers...")
+        result = client.scrape(
+            website_url="https://httpbin.org/html",
+            render_heavy_js=False,
+            headers=custom_headers,
+        )
+
+        html_content = result.get("html", "")
+        print(f"✅ Success! Received {len(html_content):,} characters of HTML")
+        print(f"Request ID: {result.get('request_id', 'N/A')}")
+
+        # Show a preview of the HTML
+        preview = html_content[:200].replace("\n", " ").strip()
+        print(f"HTML Preview: {preview}...")
+
+        return result
+
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        return None
+    finally:
+        client.close()
+
+
+def save_html_to_file(html_content: str, filename: str):
+    """Save HTML content to a file."""
+    output_dir = Path("scrape_output")
+    output_dir.mkdir(exist_ok=True)
+
+    file_path = output_dir / f"{filename}.html"
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(html_content)
+
+    print(f"💾 HTML saved to: {file_path}")
+    return file_path
+
+
+def demonstrate_curl_equivalent():
+    """Show the equivalent curl commands."""
+    print("\n🌐 Equivalent curl commands:")
+    print("=" * 35)
+
+    print("1. Basic scrape:")
+    print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\")
+    print('  -H "Content-Type: application/json" \\')
+    print('  -H "SGAI-APIKEY: your-api-key-here" \\')
+    print("  -d '{")
+    print('    "website_url": "https://example.com",')
+    print('    "render_heavy_js": false')
+    print("  }'")
+
+    print("\n2. With heavy JS rendering:")
+    print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\")
+    print('  -H "Content-Type: application/json" \\')
+    print('  -H "SGAI-APIKEY: your-api-key-here" \\')
+    print("  -d '{")
+    print('    "website_url": "https://example.com",')
+    print('    "render_heavy_js": true')
+    print("  }'")
+
+
+def main():
+    """Main function demonstrating scrape functionality."""
+    print("🚀 Scrape API Examples")
+    print("=" * 25)
+
+    # Show curl equivalents first
+    demonstrate_curl_equivalent()
+
+    try:
+        # Run examples
+        result1 = basic_scrape_example()
+        result2 = scrape_with_heavy_js()
+        result3 = scrape_with_custom_headers()
+
+        # Save results if successful
+        if result1:
+            html1 = result1.get("html", "")
+            if html1:
+                save_html_to_file(html1, "basic_scrape")
+
+        if result3:
+            html3 = result3.get("html", "")
+            if html3:
+                save_html_to_file(html3, "custom_headers_scrape")
+
+        print("\n🎯 Summary:")
+        print(f"✅ Basic scrape: {'Success' if result1 else 'Failed'}")
+        print(f"✅ Heavy JS scrape: {'Success' if result2 else 'Failed'}")
+        print(f"✅ Custom headers scrape: {'Success' if result3 else 'Failed'}")
+
+    except Exception as e:
+        print(f"❌ Unexpected error: {str(e)}")
+
+    print("\n📚 Next steps:")
+    print("• Try the curl commands in your terminal")
+    print("• Experiment with different websites")
+    print("• Test with your own custom headers")
+    print("• Compare render_heavy_js=true vs false for dynamic sites")
+
+
+if __name__ == "__main__":
+    main()
```
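The curl commands in the example's docstring translate directly to plain HTTP. A hedged sketch of the same call with `requests` (endpoint, header name, and payload are taken from the docstring; the "html" and "request_id" response keys mirror what the SDK example reads):

```python
# Plain-HTTP equivalent of the curl command in the docstring above,
# using requests; replace the placeholder with a real API key.
import requests

response = requests.post(
    "https://api.scrapegraphai.com/v1/scrape",
    headers={
        "Content-Type": "application/json",
        "SGAI-APIKEY": "your-api-key-here",
    },
    json={"website_url": "https://example.com", "render_heavy_js": False},
    timeout=120,
)
response.raise_for_status()
data = response.json()
print(f"Received {len(data.get('html', '')):,} characters of HTML")
print(f"Request ID: {data.get('request_id', 'N/A')}")
```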
