// rss.go
1 package rss 2 3 import ( 4 // log "github.com/sirupsen/logrus" 5 "crypto/sha256" 6 "encoding/hex" 7 "encoding/json" 8 "fmt" 9 "time" 10 11 "github.com/google/uuid" 12 "go.uber.org/zap" 13 14 "strings" 15 16 "github.com/mrusme/journalist/crawler" 17 "github.com/mrusme/journalist/ent" 18 19 "github.com/araddon/dateparse" 20 "github.com/microcosm-cc/bluemonday" 21 "github.com/mmcdole/gofeed" 22 ) 23 24 type Client struct { 25 parser *gofeed.Parser 26 url string 27 username string 28 password string 29 Feed *gofeed.Feed 30 Items *[]*gofeed.Item 31 ItemsCrawled []crawler.ItemCrawled 32 exceptItemGUIDs []string 33 UpdatedAt time.Time 34 logger *zap.Logger 35 } 36 37 func NewClient( 38 feedUrl string, 39 username string, 40 password string, 41 crawl bool, 42 exceptItemGUIDs []string, 43 logger *zap.Logger, 44 ) (*Client, []error) { 45 client := new(Client) 46 client.parser = gofeed.NewParser() 47 client.url = feedUrl 48 client.username = username 49 client.password = password 50 client.exceptItemGUIDs = exceptItemGUIDs 51 client.logger = logger 52 53 if errs := client.Sync(crawl); errs != nil { 54 return nil, errs 55 } 56 57 return client, nil 58 } 59 60 func (c *Client) Sync(crawl bool) []error { 61 var errs []error 62 63 c.logger.Debug( 64 "Starting RSS Sync procedure", 65 zap.Bool("crawl", crawl), 66 ) 67 68 feedCrwl := crawler.New(c.logger) 69 defer feedCrwl.Close() 70 feedCrwl.SetLocation(c.url) 71 feedCrwl.SetBasicAuth(c.username, c.password) 72 feed, err := feedCrwl.ParseFeed() 73 if err != nil { 74 c.logger.Debug( 75 "RSS Sync error occurred for feed crawling", 76 zap.String("url", c.url), 77 zap.Error(err), 78 ) 79 errs = append(errs, err) 80 return errs 81 } 82 83 c.Feed = feed 84 c.Items = &feed.Items 85 c.UpdatedAt = time.Now() 86 87 if crawl == true { 88 c.logger.Debug( 89 "RSS Sync starting crawling procedure", 90 zap.Int("exceptItemGUIDsLength", len(c.exceptItemGUIDs)), 91 ) 92 crwl := crawler.New(c.logger) 93 defer crwl.Close() 94 95 for i := 0; i < 
len(c.Feed.Items); i++ { 96 var foundException bool = false 97 itemGUID := GenerateGUIDForItem(c.Feed.Items[i]) 98 for _, exceptItemGUID := range c.exceptItemGUIDs { 99 if exceptItemGUID == itemGUID { 100 c.logger.Debug( 101 "Crawler found exception, breaking", 102 zap.String("itemGUID", exceptItemGUID), 103 zap.String("itemLink", c.Feed.Items[i].Link), 104 ) 105 foundException = true 106 break 107 } 108 } 109 110 if foundException == true { 111 continue 112 } 113 c.logger.Debug( 114 "Crawler found no exception, continuing with item", 115 zap.String("itemLink", c.Feed.Items[i].Link), 116 ) 117 crwl.Reset() 118 crwl.SetLocation(c.Feed.Items[i].Link) 119 crwl.SetBasicAuth(c.username, c.password) 120 itemCrawled, err := crwl.GetReadable(false) 121 if err != nil { 122 c.logger.Debug( 123 "Crawler failed to GetReadable", 124 zap.String("itemLink", c.Feed.Items[i].Link), 125 zap.Error(err), 126 ) 127 errs = append(errs, err) 128 continue 129 } 130 131 c.ItemsCrawled = append(c.ItemsCrawled, itemCrawled) 132 } 133 } 134 135 return errs 136 } 137 138 func (c *Client) SetFeed( 139 feedLink string, 140 username string, 141 password string, 142 dbFeedTmp *ent.FeedCreate, 143 ) *ent.FeedCreate { 144 // TODO: Get system timezone 145 ltz, _ := time.LoadLocation("UTC") 146 time.Local = ltz 147 148 feedUpdated, err := dateparse.ParseLocal(c.Feed.Updated) 149 if err != nil { 150 feedUpdated = time.Now() 151 } 152 feedPublished, err := dateparse.ParseLocal(c.Feed.Published) 153 if err != nil { 154 feedPublished = time.Now() 155 } 156 157 dbFeedTmp = dbFeedTmp. 158 SetURL(feedLink). 159 SetUsername(username). 160 SetPassword(password). 161 SetFeedTitle(c.Feed.Title). 162 SetFeedDescription(c.Feed.Description). 163 SetFeedLink(c.Feed.Link). 164 SetFeedFeedLink(c.Feed.FeedLink). 165 SetFeedUpdated(feedUpdated). 166 SetFeedPublished(feedPublished). 167 SetFeedLanguage(c.Feed.Language). 168 SetFeedCopyright(c.Feed.Copyright). 169 SetFeedGenerator(c.Feed.Generator). 
170 SetFeedCopyright(c.Feed.Copyright). 171 SetFeedCategories(strings.Join(c.Feed.Categories, ", ")) 172 173 if c.Feed.Author != nil { 174 dbFeedTmp = dbFeedTmp. 175 SetFeedAuthorName(c.Feed.Author.Name). 176 SetFeedAuthorEmail(c.Feed.Author.Email) 177 } 178 if c.Feed.Image != nil { 179 dbFeedTmp = dbFeedTmp. 180 SetFeedImageTitle(c.Feed.Image.Title). 181 SetFeedImageURL(c.Feed.Image.URL) 182 } 183 184 return dbFeedTmp 185 } 186 187 func (c *Client) SetItem( 188 feedID uuid.UUID, 189 idx int, 190 dbItemTemp *ent.ItemCreate, 191 ) *ent.ItemCreate { 192 var crawled crawler.ItemCrawled 193 if len(c.ItemsCrawled) > idx { 194 crawled = c.ItemsCrawled[idx] 195 } 196 197 item := c.Feed.Items[idx] 198 199 // TODO: Get system timezone 200 ltz, _ := time.LoadLocation("UTC") 201 time.Local = ltz 202 203 itemUpdated, err := dateparse.ParseLocal(item.Updated) 204 if err != nil { 205 itemUpdated = time.Now() 206 } 207 itemPublished, err := dateparse.ParseLocal(item.Published) 208 if err != nil { 209 itemPublished = time.Now() 210 } 211 212 var enclosureJson string = "" 213 if item.Enclosures != nil { 214 jsonbytes, err := json.Marshal(item.Enclosures) 215 if err == nil { 216 enclosureJson = string(jsonbytes) 217 } 218 } 219 220 itemDescription := bluemonday. 221 StrictPolicy(). 222 Sanitize(item.Description) 223 224 dbItemTemp = dbItemTemp. 225 SetFeedID(feedID). 226 SetItemGUID(GenerateGUIDForItem(item)). 227 SetItemTitle(item.Title). 228 SetItemDescription(itemDescription). 229 SetItemContent(item.Content). 230 SetItemLink(item.Link). 231 SetItemUpdated(itemUpdated). 232 SetItemPublished(itemPublished). 233 SetItemCategories(strings.Join(item.Categories, ",")). 234 SetItemEnclosures(enclosureJson). 235 SetCrawlerTitle(crawled.Title). 236 SetCrawlerAuthor(crawled.Author). 237 SetCrawlerExcerpt(crawled.Excerpt). 238 SetCrawlerSiteName(crawled.SiteName). 239 SetCrawlerImage(crawled.Image). 240 SetCrawlerContentHTML(crawled.ContentHtml). 
241 SetCrawlerContentText(crawled.ContentText) 242 243 if item.Author != nil { 244 dbItemTemp = dbItemTemp. 245 SetItemAuthorName(item.Author.Name). 246 SetItemAuthorEmail(item.Author.Email) 247 } 248 249 if item.Image != nil { 250 dbItemTemp = dbItemTemp. 251 SetItemImageTitle(item.Image.Title). 252 SetItemImageURL(item.Image.URL) 253 } 254 255 return dbItemTemp 256 } 257 258 func GenerateGUID(from string) string { 259 h := sha256.New() 260 h.Write([]byte(from)) 261 return hex.EncodeToString( 262 h.Sum(nil), 263 ) 264 } 265 266 func GenerateGUIDForItem(item *gofeed.Item) string { 267 return GenerateGUID( 268 fmt.Sprintf("%s%s", item.Link, item.Published), 269 ) 270 }