crawler.go
1 package crawler 2 3 import ( 4 "bufio" 5 "io" 6 "net/http" 7 "net/http/cookiejar" 8 "net/url" 9 "os" 10 "strings" 11 12 "go.uber.org/zap" 13 "golang.org/x/net/publicsuffix" 14 15 "github.com/Danny-Dasilva/CycleTLS/cycletls" 16 scraper "github.com/memclutter/go-cloudflare-scraper" 17 18 "github.com/go-shiori/go-readability" 19 ) 20 21 type ItemCrawled struct { 22 Title string 23 Author string 24 Excerpt string 25 SiteName string 26 Image string 27 ContentHtml string 28 ContentText string 29 } 30 31 type Crawler struct { 32 source io.ReadCloser 33 sourceLocation string 34 sourceLocationUrl *url.URL 35 36 UserAgent string 37 38 username string 39 password string 40 41 contentType string 42 43 logger *zap.Logger 44 } 45 46 func New(logger *zap.Logger) *Crawler { 47 crawler := new(Crawler) 48 crawler.logger = logger 49 50 crawler.source = nil 51 crawler.Reset() 52 return crawler 53 } 54 55 func (c *Crawler) Close() { 56 if c.source != nil { 57 c.source.Close() 58 c.source = nil 59 } 60 } 61 62 func (c *Crawler) Reset() { 63 c.Close() 64 c.sourceLocation = "" 65 c.sourceLocationUrl = nil 66 67 c.UserAgent = 68 "Mozilla/5.0 AppleWebKit/537.36 " + 69 "(KHTML, like Gecko; compatible; " + 70 "Googlebot/2.1; +http://www.google.com/bot.html)" 71 72 c.username = "" 73 c.password = "" 74 75 c.contentType = "" 76 } 77 78 func (c *Crawler) SetLocation(sourceLocation string) error { 79 var urlUrl *url.URL 80 var err error 81 82 if sourceLocation != "-" { 83 urlUrl, err = url.Parse(sourceLocation) 84 if err != nil { 85 return err 86 } 87 } 88 89 c.sourceLocation = sourceLocation 90 c.sourceLocationUrl = urlUrl 91 92 return nil 93 } 94 95 func (c *Crawler) SetBasicAuth(username string, password string) { 96 c.username = username 97 c.password = password 98 } 99 100 func (c *Crawler) GetSource() io.ReadCloser { 101 return c.source 102 } 103 104 func (c *Crawler) GetReadable(useCycleTLS bool) (ItemCrawled, error) { 105 if err := c.FromAuto(useCycleTLS); err != nil { 106 return ItemCrawled{}, err 107 } 108 109 article, err := readability.FromReader(c.source, c.sourceLocationUrl) 110 if err != nil { 111 return ItemCrawled{}, err 112 } 113 114 item := ItemCrawled{ 115 Title: article.Title, 116 Author: article.Byline, 117 Excerpt: article.Excerpt, 118 SiteName: article.SiteName, 119 Image: article.Image, 120 ContentHtml: article.Content, 121 ContentText: article.TextContent, 122 } 123 124 return item, nil 125 } 126 127 func (c *Crawler) FromAuto(useCycleTLS bool) error { 128 var err error 129 130 switch c.sourceLocation { 131 case "-": 132 err = c.FromStdin() 133 default: 134 switch c.sourceLocationUrl.Scheme { 135 case "http", "https": 136 if useCycleTLS { 137 err = c.FromHTTPCycleTLS() 138 } else { 139 err = c.FromHTTP() 140 } 141 default: 142 err = c.FromFile() 143 } 144 } 145 146 return err 147 } 148 149 func (c *Crawler) FromHTTP() error { 150 jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List}) 151 if err != nil { 152 return err 153 } 154 155 scraper, err := scraper.NewTransport(http.DefaultTransport) 156 client := &http.Client{ 157 Jar: jar, 158 Transport: scraper, 159 } 160 161 req, err := http.NewRequest("GET", c.sourceLocation, nil) 162 if err != nil { 163 return err 164 } 165 166 req.Header.Set("User-Agent", 167 c.UserAgent) 168 req.Header.Set("Accept", 169 "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,"+ 170 "image/webp,*/*;q=0.8") 171 req.Header.Set("Accept-Language", 172 "en-US,en;q=0.5") 173 req.Header.Set("DNT", 174 "1") 175 176 if c.username != "" && c.password != "" { 177 req.SetBasicAuth(c.username, c.password) 178 } 179 180 resp, err := client.Do(req) 181 if err != nil { 182 return err 183 } 184 185 c.Close() 186 c.source = resp.Body 187 return nil 188 } 189 190 func (c *Crawler) FromHTTPCycleTLS() error { 191 client := cycletls.Init() 192 193 resp, err := client.Do(c.sourceLocation, cycletls.Options{ 194 Body: "", 195 Ja3: "771,4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-51-57-47-53-10,0-23-65281-10-11-35-16-5-51-43-13-45-28-21,29-23-24-25-256-257,0", 196 UserAgent: "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0", 197 }, "GET") 198 if err != nil { 199 return err 200 } 201 202 c.Close() 203 c.source = io.NopCloser(strings.NewReader(resp.Body)) 204 return nil 205 } 206 207 func (c *Crawler) FromFile() error { 208 file, err := os.Open(c.sourceLocation) 209 if err != nil { 210 return err 211 } 212 213 c.Close() 214 c.source = file 215 return nil 216 } 217 218 func (c *Crawler) FromStdin() error { 219 c.Close() 220 c.source = io.NopCloser(bufio.NewReader(os.Stdin)) 221 return nil 222 } 223 224 func (c *Crawler) Detect() error { 225 buf := make([]byte, 512) 226 _, err := c.source.Read(buf) 227 if err != nil { 228 return err 229 } 230 231 c.contentType = http.DetectContentType(buf) 232 return nil 233 } 234 235 func (c *Crawler) GetContentType() string { 236 return c.contentType 237 }