// crawler/crawler.go
  1  package crawler
  2  
import (
	"bufio"
	"bytes"
	"errors"
	"io"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"strings"

	"go.uber.org/zap"
	"golang.org/x/net/publicsuffix"

	"github.com/Danny-Dasilva/CycleTLS/cycletls"
	scraper "github.com/memclutter/go-cloudflare-scraper"

	"github.com/go-shiori/go-readability"
)
 20  
// ItemCrawled is the readability-extracted representation of a crawled page,
// as produced by GetReadable.
type ItemCrawled struct {
	Title       string // article title
	Author      string // byline (author attribution), may be empty
	Excerpt     string // short excerpt/summary of the article
	SiteName    string // name of the publishing site, may be empty
	Image       string // URL of the lead image, may be empty
	ContentHtml string // cleaned article body as HTML
	ContentText string // cleaned article body as plain text
}
 30  
// Crawler fetches a document from stdin, a local file, or an HTTP(S) URL
// and exposes it as a stream for readability extraction. It is not safe
// for concurrent use; one Crawler handles one source at a time.
type Crawler struct {
	source            io.ReadCloser // currently open stream; nil when none
	sourceLocation    string        // raw location string ("-" means stdin)
	sourceLocationUrl *url.URL      // parsed location; nil for stdin or before SetLocation

	// UserAgent is sent on FromHTTP requests; defaults to a Googlebot string
	// (set in Reset).
	UserAgent string

	username string // HTTP basic-auth user; empty disables auth
	password string // HTTP basic-auth password; empty disables auth

	contentType string // result of Detect; empty until Detect runs

	logger *zap.Logger
}
 45  
 46  func New(logger *zap.Logger) *Crawler {
 47  	crawler := new(Crawler)
 48  	crawler.logger = logger
 49  
 50  	crawler.source = nil
 51  	crawler.Reset()
 52  	return crawler
 53  }
 54  
 55  func (c *Crawler) Close() {
 56  	if c.source != nil {
 57  		c.source.Close()
 58  		c.source = nil
 59  	}
 60  }
 61  
 62  func (c *Crawler) Reset() {
 63  	c.Close()
 64  	c.sourceLocation = ""
 65  	c.sourceLocationUrl = nil
 66  
 67  	c.UserAgent =
 68  		"Mozilla/5.0 AppleWebKit/537.36 " +
 69  			"(KHTML, like Gecko; compatible; " +
 70  			"Googlebot/2.1; +http://www.google.com/bot.html)"
 71  
 72  	c.username = ""
 73  	c.password = ""
 74  
 75  	c.contentType = ""
 76  }
 77  
 78  func (c *Crawler) SetLocation(sourceLocation string) error {
 79  	var urlUrl *url.URL
 80  	var err error
 81  
 82  	if sourceLocation != "-" {
 83  		urlUrl, err = url.Parse(sourceLocation)
 84  		if err != nil {
 85  			return err
 86  		}
 87  	}
 88  
 89  	c.sourceLocation = sourceLocation
 90  	c.sourceLocationUrl = urlUrl
 91  
 92  	return nil
 93  }
 94  
 95  func (c *Crawler) SetBasicAuth(username string, password string) {
 96  	c.username = username
 97  	c.password = password
 98  }
 99  
// GetSource returns the currently open source stream, or nil when none is
// open. The crawler retains ownership: Close/Reset on the crawler also
// closes the returned stream.
func (c *Crawler) GetSource() io.ReadCloser {
	return c.source
}
103  
104  func (c *Crawler) GetReadable(useCycleTLS bool) (ItemCrawled, error) {
105  	if err := c.FromAuto(useCycleTLS); err != nil {
106  		return ItemCrawled{}, err
107  	}
108  
109  	article, err := readability.FromReader(c.source, c.sourceLocationUrl)
110  	if err != nil {
111  		return ItemCrawled{}, err
112  	}
113  
114  	item := ItemCrawled{
115  		Title:       article.Title,
116  		Author:      article.Byline,
117  		Excerpt:     article.Excerpt,
118  		SiteName:    article.SiteName,
119  		Image:       article.Image,
120  		ContentHtml: article.Content,
121  		ContentText: article.TextContent,
122  	}
123  
124  	return item, nil
125  }
126  
127  func (c *Crawler) FromAuto(useCycleTLS bool) error {
128  	var err error
129  
130  	switch c.sourceLocation {
131  	case "-":
132  		err = c.FromStdin()
133  	default:
134  		switch c.sourceLocationUrl.Scheme {
135  		case "http", "https":
136  			if useCycleTLS {
137  				err = c.FromHTTPCycleTLS()
138  			} else {
139  				err = c.FromHTTP()
140  			}
141  		default:
142  			err = c.FromFile()
143  		}
144  	}
145  
146  	return err
147  }
148  
149  func (c *Crawler) FromHTTP() error {
150  	jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List})
151  	if err != nil {
152  		return err
153  	}
154  
155  	scraper, err := scraper.NewTransport(http.DefaultTransport)
156  	client := &http.Client{
157  		Jar:       jar,
158  		Transport: scraper,
159  	}
160  
161  	req, err := http.NewRequest("GET", c.sourceLocation, nil)
162  	if err != nil {
163  		return err
164  	}
165  
166  	req.Header.Set("User-Agent",
167  		c.UserAgent)
168  	req.Header.Set("Accept",
169  		"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,"+
170  			"image/webp,*/*;q=0.8")
171  	req.Header.Set("Accept-Language",
172  		"en-US,en;q=0.5")
173  	req.Header.Set("DNT",
174  		"1")
175  
176  	if c.username != "" && c.password != "" {
177  		req.SetBasicAuth(c.username, c.password)
178  	}
179  
180  	resp, err := client.Do(req)
181  	if err != nil {
182  		return err
183  	}
184  
185  	c.Close()
186  	c.source = resp.Body
187  	return nil
188  }
189  
190  func (c *Crawler) FromHTTPCycleTLS() error {
191  	client := cycletls.Init()
192  
193  	resp, err := client.Do(c.sourceLocation, cycletls.Options{
194  		Body:      "",
195  		Ja3:       "771,4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-51-57-47-53-10,0-23-65281-10-11-35-16-5-51-43-13-45-28-21,29-23-24-25-256-257,0",
196  		UserAgent: "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0",
197  	}, "GET")
198  	if err != nil {
199  		return err
200  	}
201  
202  	c.Close()
203  	c.source = io.NopCloser(strings.NewReader(resp.Body))
204  	return nil
205  }
206  
207  func (c *Crawler) FromFile() error {
208  	file, err := os.Open(c.sourceLocation)
209  	if err != nil {
210  		return err
211  	}
212  
213  	c.Close()
214  	c.source = file
215  	return nil
216  }
217  
218  func (c *Crawler) FromStdin() error {
219  	c.Close()
220  	c.source = io.NopCloser(bufio.NewReader(os.Stdin))
221  	return nil
222  }
223  
224  func (c *Crawler) Detect() error {
225  	buf := make([]byte, 512)
226  	_, err := c.source.Read(buf)
227  	if err != nil {
228  		return err
229  	}
230  
231  	c.contentType = http.DetectContentType(buf)
232  	return nil
233  }
234  
// GetContentType returns the MIME type sniffed by Detect, or the empty
// string if Detect has not been called since the last Reset.
func (c *Crawler) GetContentType() string {
	return c.contentType
}