// rss/rss.go
  1  package rss
  2  
  3  import (
  4  	// log "github.com/sirupsen/logrus"
  5  	"crypto/sha256"
  6  	"encoding/hex"
  7  	"encoding/json"
  8  	"fmt"
  9  	"time"
 10  
 11  	"github.com/google/uuid"
 12  	"go.uber.org/zap"
 13  
 14  	"strings"
 15  
 16  	"github.com/mrusme/journalist/crawler"
 17  	"github.com/mrusme/journalist/ent"
 18  
 19  	"github.com/araddon/dateparse"
 20  	"github.com/microcosm-cc/bluemonday"
 21  	"github.com/mmcdole/gofeed"
 22  )
 23  
// Client holds a configured feed source (URL plus optional HTTP basic-auth
// credentials) together with the results of the most recent Sync run.
type Client struct {
	parser          *gofeed.Parser         // created in NewClient; feed parsing itself goes through crawler.ParseFeed — NOTE(review): possibly unused, confirm
	url             string                 // feed URL passed to NewClient
	username        string                 // basic-auth username ("" when unauthenticated)
	password        string                 // basic-auth password
	Feed            *gofeed.Feed           // parsed feed from the last Sync
	Items           *[]*gofeed.Item        // pointer to Feed.Items, set by Sync
	ItemsCrawled    []crawler.ItemCrawled  // readable versions of crawled items (only filled when Sync ran with crawl=true)
	exceptItemGUIDs []string               // item GUIDs (see GenerateGUIDForItem) to skip when crawling
	UpdatedAt       time.Time              // timestamp of the last successful feed fetch
	logger          *zap.Logger
}
 36  
 37  func NewClient(
 38  	feedUrl string,
 39  	username string,
 40  	password string,
 41  	crawl bool,
 42  	exceptItemGUIDs []string,
 43  	logger *zap.Logger,
 44  ) (*Client, []error) {
 45  	client := new(Client)
 46  	client.parser = gofeed.NewParser()
 47  	client.url = feedUrl
 48  	client.username = username
 49  	client.password = password
 50  	client.exceptItemGUIDs = exceptItemGUIDs
 51  	client.logger = logger
 52  
 53  	if errs := client.Sync(crawl); errs != nil {
 54  		return nil, errs
 55  	}
 56  
 57  	return client, nil
 58  }
 59  
 60  func (c *Client) Sync(crawl bool) []error {
 61  	var errs []error
 62  
 63  	c.logger.Debug(
 64  		"Starting RSS Sync procedure",
 65  		zap.Bool("crawl", crawl),
 66  	)
 67  
 68  	feedCrwl := crawler.New(c.logger)
 69  	defer feedCrwl.Close()
 70  	feedCrwl.SetLocation(c.url)
 71  	feedCrwl.SetBasicAuth(c.username, c.password)
 72  	feed, err := feedCrwl.ParseFeed()
 73  	if err != nil {
 74  		c.logger.Debug(
 75  			"RSS Sync error occurred for feed crawling",
 76  			zap.String("url", c.url),
 77  			zap.Error(err),
 78  		)
 79  		errs = append(errs, err)
 80  		return errs
 81  	}
 82  
 83  	c.Feed = feed
 84  	c.Items = &feed.Items
 85  	c.UpdatedAt = time.Now()
 86  
 87  	if crawl == true {
 88  		c.logger.Debug(
 89  			"RSS Sync starting crawling procedure",
 90  			zap.Int("exceptItemGUIDsLength", len(c.exceptItemGUIDs)),
 91  		)
 92  		crwl := crawler.New(c.logger)
 93  		defer crwl.Close()
 94  
 95  		for i := 0; i < len(c.Feed.Items); i++ {
 96  			var foundException bool = false
 97  			itemGUID := GenerateGUIDForItem(c.Feed.Items[i])
 98  			for _, exceptItemGUID := range c.exceptItemGUIDs {
 99  				if exceptItemGUID == itemGUID {
100  					c.logger.Debug(
101  						"Crawler found exception, breaking",
102  						zap.String("itemGUID", exceptItemGUID),
103  						zap.String("itemLink", c.Feed.Items[i].Link),
104  					)
105  					foundException = true
106  					break
107  				}
108  			}
109  
110  			if foundException == true {
111  				continue
112  			}
113  			c.logger.Debug(
114  				"Crawler found no exception, continuing with item",
115  				zap.String("itemLink", c.Feed.Items[i].Link),
116  			)
117  			crwl.Reset()
118  			crwl.SetLocation(c.Feed.Items[i].Link)
119  			crwl.SetBasicAuth(c.username, c.password)
120  			itemCrawled, err := crwl.GetReadable(false)
121  			if err != nil {
122  				c.logger.Debug(
123  					"Crawler failed to GetReadable",
124  					zap.String("itemLink", c.Feed.Items[i].Link),
125  					zap.Error(err),
126  				)
127  				errs = append(errs, err)
128  				continue
129  			}
130  
131  			c.ItemsCrawled = append(c.ItemsCrawled, itemCrawled)
132  		}
133  	}
134  
135  	return errs
136  }
137  
138  func (c *Client) SetFeed(
139  	feedLink string,
140  	username string,
141  	password string,
142  	dbFeedTmp *ent.FeedCreate,
143  ) *ent.FeedCreate {
144  	// TODO: Get system timezone
145  	ltz, _ := time.LoadLocation("UTC")
146  	time.Local = ltz
147  
148  	feedUpdated, err := dateparse.ParseLocal(c.Feed.Updated)
149  	if err != nil {
150  		feedUpdated = time.Now()
151  	}
152  	feedPublished, err := dateparse.ParseLocal(c.Feed.Published)
153  	if err != nil {
154  		feedPublished = time.Now()
155  	}
156  
157  	dbFeedTmp = dbFeedTmp.
158  		SetURL(feedLink).
159  		SetUsername(username).
160  		SetPassword(password).
161  		SetFeedTitle(c.Feed.Title).
162  		SetFeedDescription(c.Feed.Description).
163  		SetFeedLink(c.Feed.Link).
164  		SetFeedFeedLink(c.Feed.FeedLink).
165  		SetFeedUpdated(feedUpdated).
166  		SetFeedPublished(feedPublished).
167  		SetFeedLanguage(c.Feed.Language).
168  		SetFeedCopyright(c.Feed.Copyright).
169  		SetFeedGenerator(c.Feed.Generator).
170  		SetFeedCopyright(c.Feed.Copyright).
171  		SetFeedCategories(strings.Join(c.Feed.Categories, ", "))
172  
173  	if c.Feed.Author != nil {
174  		dbFeedTmp = dbFeedTmp.
175  			SetFeedAuthorName(c.Feed.Author.Name).
176  			SetFeedAuthorEmail(c.Feed.Author.Email)
177  	}
178  	if c.Feed.Image != nil {
179  		dbFeedTmp = dbFeedTmp.
180  			SetFeedImageTitle(c.Feed.Image.Title).
181  			SetFeedImageURL(c.Feed.Image.URL)
182  	}
183  
184  	return dbFeedTmp
185  }
186  
187  func (c *Client) SetItem(
188  	feedID uuid.UUID,
189  	idx int,
190  	dbItemTemp *ent.ItemCreate,
191  ) *ent.ItemCreate {
192  	var crawled crawler.ItemCrawled
193  	if len(c.ItemsCrawled) > idx {
194  		crawled = c.ItemsCrawled[idx]
195  	}
196  
197  	item := c.Feed.Items[idx]
198  
199  	// TODO: Get system timezone
200  	ltz, _ := time.LoadLocation("UTC")
201  	time.Local = ltz
202  
203  	itemUpdated, err := dateparse.ParseLocal(item.Updated)
204  	if err != nil {
205  		itemUpdated = time.Now()
206  	}
207  	itemPublished, err := dateparse.ParseLocal(item.Published)
208  	if err != nil {
209  		itemPublished = time.Now()
210  	}
211  
212  	var enclosureJson string = ""
213  	if item.Enclosures != nil {
214  		jsonbytes, err := json.Marshal(item.Enclosures)
215  		if err == nil {
216  			enclosureJson = string(jsonbytes)
217  		}
218  	}
219  
220  	itemDescription := bluemonday.
221  		StrictPolicy().
222  		Sanitize(item.Description)
223  
224  	dbItemTemp = dbItemTemp.
225  		SetFeedID(feedID).
226  		SetItemGUID(GenerateGUIDForItem(item)).
227  		SetItemTitle(item.Title).
228  		SetItemDescription(itemDescription).
229  		SetItemContent(item.Content).
230  		SetItemLink(item.Link).
231  		SetItemUpdated(itemUpdated).
232  		SetItemPublished(itemPublished).
233  		SetItemCategories(strings.Join(item.Categories, ",")).
234  		SetItemEnclosures(enclosureJson).
235  		SetCrawlerTitle(crawled.Title).
236  		SetCrawlerAuthor(crawled.Author).
237  		SetCrawlerExcerpt(crawled.Excerpt).
238  		SetCrawlerSiteName(crawled.SiteName).
239  		SetCrawlerImage(crawled.Image).
240  		SetCrawlerContentHTML(crawled.ContentHtml).
241  		SetCrawlerContentText(crawled.ContentText)
242  
243  	if item.Author != nil {
244  		dbItemTemp = dbItemTemp.
245  			SetItemAuthorName(item.Author.Name).
246  			SetItemAuthorEmail(item.Author.Email)
247  	}
248  
249  	if item.Image != nil {
250  		dbItemTemp = dbItemTemp.
251  			SetItemImageTitle(item.Image.Title).
252  			SetItemImageURL(item.Image.URL)
253  	}
254  
255  	return dbItemTemp
256  }
257  
// GenerateGUID returns the hex-encoded SHA-256 digest of from, used as a
// stable identifier for feeds and items.
func GenerateGUID(from string) string {
	sum := sha256.Sum256([]byte(from))
	return hex.EncodeToString(sum[:])
}
265  
266  func GenerateGUIDForItem(item *gofeed.Item) string {
267  	return GenerateGUID(
268  		fmt.Sprintf("%s%s", item.Link, item.Published),
269  	)
270  }