Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions extensions/dynamic_content.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
// Package extensions implements various helper addons for Colly
package extensions

import (
"bytes"
"context"
"io"
"net/http"
"time"

"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/launcher"
"github.com/go-rod/rod/lib/proto"
"github.com/gocolly/colly/v2"
)

// DynamicContentOptions configures how the DynamicContent extension launches
// and drives the browser used to render pages.
type DynamicContentOptions struct {
	// Timeout bounds navigation and the selector wait for a single page.
	Timeout time.Duration
	// WaitForSelector is a CSS selector that must match at least one element
	// before the page counts as loaded. Leave it empty to skip the wait.
	WaitForSelector string
	// UserAgent overrides the browser's user agent string. Leave it empty to
	// keep the browser's own value.
	UserAgent string
	// Headless selects whether the launched browser runs in headless mode.
	Headless bool
	// CustomBrowserPath, when non-empty, is handed to the launcher as the
	// browser executable to start.
	CustomBrowserPath string
	// ExtraHeaders lists additional header name/value pairs applied to every
	// page the extension opens.
	ExtraHeaders map[string]string
}

// DefaultDynamicContentOptions returns a newly allocated options value filled
// with the extension's defaults: a 30 second timeout, waiting for "body", a
// desktop Chrome user agent, headless mode, and an empty (non-nil) header map.
// Each call returns an independent value, so callers may modify the result.
func DefaultDynamicContentOptions() *DynamicContentOptions {
	opts := new(DynamicContentOptions)
	opts.Timeout = 30 * time.Second
	opts.WaitForSelector = "body"
	opts.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
	opts.Headless = true
	opts.ExtraHeaders = map[string]string{}
	return opts
}

// dynamicContentTransport is an http.RoundTripper that answers GET requests
// with HTML rendered by a Rod-controlled browser and delegates everything
// else to another transport.
type dynamicContentTransport struct {
	// browser is the connected Rod browser in which pages are opened.
	browser *rod.Browser
	// options holds the rendering settings applied to every page.
	options *DynamicContentOptions
	// nextTransport serves non-GET requests and is the fallback when
	// rendering fails.
	nextTransport http.RoundTripper
}

// RoundTrip implements http.RoundTripper.
//
// GET requests are rendered in the headless browser and answered with a
// synthetic response whose body is the rendered HTML. Requests using any
// other method are passed straight to nextTransport.
//
// Rendering is best-effort: when any browser step fails (page creation,
// user agent or header setup, navigation, selector wait, HTML extraction)
// the request is sent through nextTransport instead, and the browser error is
// intentionally not surfaced to the caller.
//
// NOTE: the synthetic response always reports status 200 and carries only a
// Content-Type header; the status and headers of the real upstream response
// are not captured from the browser.
func (t *dynamicContentTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	if req.Method != http.MethodGet {
		return t.nextTransport.RoundTrip(req)
	}

	html, err := t.renderHTML(req)
	if err != nil {
		// Deliberate fallback: serve the unrendered page rather than fail.
		return t.nextTransport.RoundTrip(req)
	}

	body := []byte(html)
	header := make(http.Header)
	header.Set("Content-Type", "text/html; charset=utf-8")

	return &http.Response{
		Status:        "200 OK",
		StatusCode:    http.StatusOK,
		Proto:         "HTTP/1.1",
		ProtoMajor:    1,
		ProtoMinor:    1,
		Header:        header,
		Body:          io.NopCloser(bytes.NewReader(body)),
		ContentLength: int64(len(body)),
		Request:       req,
	}, nil
}

// renderHTML opens a fresh browser page, applies the configured user agent
// and extra headers, navigates to req.URL, optionally waits for
// options.WaitForSelector, and returns the page's HTML.
//
// All page operations are bound to a context derived from req.Context(), so
// cancelling the request aborts rendering. A positive options.Timeout adds a
// deadline on top of that; a zero or negative Timeout adds none (previously a
// zero Timeout produced an already-expired context, so rendering could never
// succeed).
//
// Unlike the Must* helpers, every failure is returned as an error rather than
// raised as a panic.
func (t *dynamicContentTransport) renderHTML(req *http.Request) (string, error) {
	ctx := req.Context()
	if t.options.Timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, t.options.Timeout)
		defer cancel()
	}

	// The page is created without ctx so that the deferred Close below still
	// works after ctx has expired or been cancelled.
	page, err := t.browser.Page(proto.TargetCreateTarget{})
	if err != nil {
		return "", err
	}
	// Best-effort cleanup: a Close error cannot be acted on here.
	defer func() { _ = page.Close() }()

	p := page.Context(ctx)

	if ua := t.options.UserAgent; ua != "" {
		if err := p.SetUserAgent(&proto.NetworkSetUserAgentOverride{UserAgent: ua}); err != nil {
			return "", err
		}
	}

	if n := len(t.options.ExtraHeaders); n > 0 {
		// SetExtraHeaders expects a flat name, value, name, value, ... list.
		pairs := make([]string, 0, 2*n)
		for name, value := range t.options.ExtraHeaders {
			pairs = append(pairs, name, value)
		}
		// The returned restore function is not needed: the page is closed
		// when this call returns.
		if _, err := p.SetExtraHeaders(pairs); err != nil {
			return "", err
		}
	}

	if err := p.Navigate(req.URL.String()); err != nil {
		return "", err
	}

	if sel := t.options.WaitForSelector; sel != "" {
		// Wait until at least one element matches the selector.
		if err := p.WaitElementsMoreThan(sel, 0); err != nil {
			return "", err
		}
	}

	return p.HTML()
}

// DynamicContent turns on JavaScript rendering for the collector c by
// launching a browser through Rod and installing a transport that renders GET
// requests in it. A nil options value is replaced with
// DefaultDynamicContentOptions().
//
// The installed transport uses http.DefaultTransport for non-GET requests and
// as its fallback when rendering fails.
//
// NOTE(review): launching and connecting use Rod's Must* helpers, which by
// convention panic on failure; this function has no error return to report
// such a failure otherwise.
// NOTE(review): the launched browser is never closed by this function and no
// handle to it is returned - confirm how callers are expected to shut it down.
// NOTE(review): the launcher flags below look like they relax the browser's
// sandbox and web-security protections - confirm this is intended when
// rendering untrusted pages.
func DynamicContent(c *colly.Collector, options *DynamicContentOptions) {
	if options == nil {
		options = DefaultDynamicContentOptions()
	}

	// Configure the browser launcher.
	l := launcher.New().Headless(options.Headless)
	l = l.Set("disable-web-security", "true")
	l = l.Set("disable-setuid-sandbox", "true")
	l = l.Set("no-sandbox", "true")
	if path := options.CustomBrowserPath; path != "" {
		l = l.Bin(path)
	}

	// Start the browser process and attach to its control endpoint.
	controlURL := l.MustLaunch()
	browser := rod.New().ControlURL(controlURL).MustConnect()

	// Route the collector's requests through the rendering transport.
	c.WithTransport(&dynamicContentTransport{
		browser:       browser,
		options:       options,
		nextTransport: http.DefaultTransport,
	})
}
7 changes: 6 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ require (
github.com/PuerkitoBio/goquery v1.10.2
github.com/antchfx/htmlquery v1.3.4
github.com/antchfx/xmlquery v1.4.4
github.com/go-rod/rod v0.116.2
github.com/gobwas/glob v0.2.3
github.com/gocolly/colly v1.2.0
github.com/jawher/mow.cli v1.1.0
github.com/kennygrant/sanitize v1.2.4
github.com/nlnwa/whatwg-url v0.6.1
Expand All @@ -25,6 +25,11 @@ require (
github.com/bits-and-blooms/bitset v1.22.0 // indirect
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/ysmood/fetchup v0.2.3 // indirect
github.com/ysmood/goob v0.4.0 // indirect
github.com/ysmood/got v0.40.0 // indirect
github.com/ysmood/gson v0.7.3 // indirect
github.com/ysmood/leakless v0.9.0 // indirect
golang.org/x/text v0.23.0 // indirect
google.golang.org/protobuf v1.36.6 // indirect
)
18 changes: 16 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-rod/rod v0.116.2 h1:A5t2Ky2A+5eD/ZJQr1EfsQSe5rms5Xof/qj296e+ZqA=
github.com/go-rod/rod v0.116.2/go.mod h1:H+CMO9SCNc2TJ2WfrG+pKhITz57uGNYU43qYHh438Mg=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ=
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw=
Expand All @@ -44,6 +44,20 @@ github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/ysmood/fetchup v0.2.3 h1:ulX+SonA0Vma5zUFXtv52Kzip/xe7aj4vqT5AJwQ+ZQ=
github.com/ysmood/fetchup v0.2.3/go.mod h1:xhibcRKziSvol0H1/pj33dnKrYyI2ebIvz5cOOkYGns=
github.com/ysmood/goob v0.4.0 h1:HsxXhyLBeGzWXnqVKtmT9qM7EuVs/XOgkX7T6r1o1AQ=
github.com/ysmood/goob v0.4.0/go.mod h1:u6yx7ZhS4Exf2MwciFr6nIM8knHQIE22lFpWHnfql18=
github.com/ysmood/gop v0.2.0 h1:+tFrG0TWPxT6p9ZaZs+VY+opCvHU8/3Fk6BaNv6kqKg=
github.com/ysmood/gop v0.2.0/go.mod h1:rr5z2z27oGEbyB787hpEcx4ab8cCiPnKxn0SUHt6xzk=
github.com/ysmood/got v0.40.0 h1:ZQk1B55zIvS7zflRrkGfPDrPG3d7+JOza1ZkNxcc74Q=
github.com/ysmood/got v0.40.0/go.mod h1:W7DdpuX6skL3NszLmAsC5hT7JAhuLZhByVzHTq874Qg=
github.com/ysmood/gotrace v0.6.0 h1:SyI1d4jclswLhg7SWTL6os3L1WOKeNn/ZtzVQF8QmdY=
github.com/ysmood/gotrace v0.6.0/go.mod h1:TzhIG7nHDry5//eYZDYcTzuJLYQIkykJzCRIo4/dzQM=
github.com/ysmood/gson v0.7.3 h1:QFkWbTH8MxyUTKPkVWAENJhxqdBa4lYTQWqZCiLG6kE=
github.com/ysmood/gson v0.7.3/go.mod h1:3Kzs5zDl21g5F/BlLTNcuAGAYLKt2lV5G8D1zF3RNmg=
github.com/ysmood/leakless v0.9.0 h1:qxCG5VirSBvmi3uynXFkcnLMzkphdh3xx5FtrORwDCU=
github.com/ysmood/leakless v0.9.0/go.mod h1:R8iAXPRaG97QJwqxs74RdwzcRHT1SWCGTNqY8q0JvMQ=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
Expand Down