crawler

package
v0.27.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 23, 2026 License: MIT Imports: 23 Imported by: 0

Documentation

Index

Constants

View Source
const MaxBodySampleSize = 50 * 1024

MaxBodySampleSize is the maximum size of body sample stored for tech detection (50KB)

Variables

This section is empty.

Functions

func IsPathAllowed

func IsPathAllowed(rules *RobotsRules, path string) bool

IsPathAllowed checks if a path is allowed by robots.txt rules

Types

type CacheCheckAttempt

type CacheCheckAttempt struct {
	Attempt     int    `json:"attempt"`
	CacheStatus string `json:"cache_status"`
	Delay       int    `json:"delay_ms"`
}

CacheCheckAttempt stores the result of a single cache status check.

type Config

type Config struct {
	DefaultTimeout time.Duration // Default timeout for requests
	MaxConcurrency int           // Maximum number of concurrent requests
	RateLimit      int           // Determines request delay range: base=1s/RateLimit, range=base to 1s
	UserAgent      string        // User agent string for requests
	RetryAttempts  int           // Number of retry attempts for failed requests
	RetryDelay     time.Duration // Delay between retry attempts
	SkipCachedURLs bool          // Whether to skip URLs that are already cached (HIT)
	Port           string        // Server port
	Env            string        // Environment (development/production)
	LogLevel       string        // Logging level
	DatabaseURL    string        // Database connection URL
	AuthToken      string        // Database authentication token
	SentryDSN      string        // Sentry DSN for error tracking
	FindLinks      bool          // Whether to extract links (e.g. PDFs/docs) from pages
	SkipSSRFCheck  bool          // Skip SSRF protection (for tests only, never enable in production)
}

Config holds the configuration for a crawler instance

func DefaultConfig

func DefaultConfig() *Config

DefaultConfig returns a Config instance with default values

type CrawlOptions

type CrawlOptions struct {
	MaxPages    int  // Maximum pages to crawl
	Concurrency int  // Number of concurrent crawlers
	RateLimit   int  // Maximum requests per second
	Timeout     int  // Request timeout in seconds
	FollowLinks bool // Whether to follow links on crawled pages
}

CrawlOptions defines configuration options for a crawl operation

type CrawlResult

type CrawlResult struct {
	URL                 string              `json:"url"`
	ResponseTime        int64               `json:"response_time"`
	StatusCode          int                 `json:"status_code"`
	Error               string              `json:"error,omitempty"`
	Warning             string              `json:"warning,omitempty"`
	CacheStatus         string              `json:"cache_status"`
	ContentType         string              `json:"content_type"`
	ContentLength       int64               `json:"content_length"`
	Headers             http.Header         `json:"headers"`
	RedirectURL         string              `json:"redirect_url"`
	Performance         PerformanceMetrics  `json:"performance"`
	Timestamp           int64               `json:"timestamp"`
	RetryCount          int                 `json:"retry_count"`
	SkippedCrawl        bool                `json:"skipped_crawl,omitempty"`
	Links               map[string][]string `json:"links,omitempty"`
	SecondResponseTime  int64               `json:"second_response_time,omitempty"`
	SecondCacheStatus   string              `json:"second_cache_status,omitempty"`
	SecondContentLength int64               `json:"second_content_length,omitempty"`
	SecondHeaders       http.Header         `json:"second_headers,omitempty"`
	SecondPerformance   *PerformanceMetrics `json:"second_performance,omitempty"`
	CacheCheckAttempts  []CacheCheckAttempt `json:"cache_check_attempts,omitempty"`
	BodySample          []byte              `json:"-"` // Truncated body for tech detection (not serialised)
	Body                []byte              `json:"-"` // Full body for storage upload (not serialised)
}

CrawlResult represents the result of a URL crawl operation

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler represents a URL crawler with configuration and metrics

func New

func New(config *Config, id ...string) *Crawler

New creates a new Crawler instance with the given configuration and optional ID. If config is nil, default configuration is used.

func (*Crawler) CheckCacheStatus

func (c *Crawler) CheckCacheStatus(ctx context.Context, targetURL string) (string, error)

func (*Crawler) Config

func (c *Crawler) Config() *Config

Config returns the Crawler's configuration.

func (*Crawler) CreateHTTPClient

func (c *Crawler) CreateHTTPClient(timeout time.Duration) *http.Client

CreateHTTPClient returns a configured HTTP client with SSRF protection

func (*Crawler) DiscoverSitemaps

func (c *Crawler) DiscoverSitemaps(ctx context.Context, domain string) ([]string, error)

DiscoverSitemaps is a backward-compatible wrapper that only returns sitemaps

func (*Crawler) DiscoverSitemapsAndRobots

func (c *Crawler) DiscoverSitemapsAndRobots(ctx context.Context, domain string) (*SitemapDiscoveryResult, error)

DiscoverSitemapsAndRobots attempts to find sitemaps and parse robots.txt rules for a domain

func (*Crawler) FilterURLs

func (c *Crawler) FilterURLs(urls []string, includePaths, excludePaths []string) []string

FilterURLs filters URLs based on include/exclude patterns

func (*Crawler) GetUserAgent

func (c *Crawler) GetUserAgent() string

GetUserAgent returns the user agent string for this crawler

func (*Crawler) ParseSitemap

func (c *Crawler) ParseSitemap(ctx context.Context, sitemapURL string) ([]string, error)

ParseSitemap extracts URLs from a sitemap

func (*Crawler) WarmURL

func (c *Crawler) WarmURL(ctx context.Context, targetURL string, findLinks bool) (*CrawlResult, error)

WarmURL performs a crawl of the specified URL and returns the result. It respects context cancellation, enforces timeout, and treats non-2xx statuses as errors.

type PerformanceMetrics

type PerformanceMetrics struct {
	DNSLookupTime       int64 `json:"dns_lookup_time"`
	TCPConnectionTime   int64 `json:"tcp_connection_time"`
	TLSHandshakeTime    int64 `json:"tls_handshake_time"`
	TTFB                int64 `json:"ttfb"`
	ContentTransferTime int64 `json:"content_transfer_time"`
}

PerformanceMetrics holds detailed timing information for a request.

type RobotsRules

type RobotsRules struct {
	// CrawlDelay in seconds (0 means no delay specified)
	CrawlDelay int
	// Sitemaps found in robots.txt
	Sitemaps []string
	// DisallowPatterns are URL patterns that should not be crawled
	DisallowPatterns []string
	// AllowPatterns override DisallowPatterns (more specific)
	AllowPatterns []string
}

RobotsRules contains parsed robots.txt rules for a domain

func ParseRobotsTxt

func ParseRobotsTxt(ctx context.Context, domain string, userAgent string) (*RobotsRules, error)

ParseRobotsTxt fetches and parses robots.txt for a domain

The parser follows these rules in order of precedence: (1) if there are specific rules for "AdaptBot", use those; (2) otherwise, fall back to wildcard (*) rules.

We intentionally don't match SEO crawler rules (AhrefsBot, MJ12bot, etc.) as those often have punitive 10s delays meant for aggressive crawlers. Most sites have no crawl-delay for the default * user-agent.

type Sitemap

type Sitemap struct {
	XMLName xml.Name `xml:"sitemap"`
	Loc     string   `xml:"loc"`
}

type SitemapDiscoveryResult

type SitemapDiscoveryResult struct {
	Sitemaps    []string
	RobotsRules *RobotsRules
}

SitemapDiscoveryResult contains both sitemaps and robots.txt rules

type SitemapIndex

type SitemapIndex struct {
	XMLName  xml.Name  `xml:"sitemapindex"`
	Sitemaps []Sitemap `xml:"sitemap"`
}

SitemapIndex represents a sitemap index document (the sitemapindex XML element) that references multiple child sitemaps.

type URL

type URL struct {
	XMLName xml.Name `xml:"url"`
	Loc     string   `xml:"loc"`
}

type URLSet

type URLSet struct {
	XMLName xml.Name `xml:"urlset"`
	URLs    []URL    `xml:"url"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL