// scrape.mjs
import * as util from "./util.mjs"

// https://emailregex.com/
const EMAIL_RGX = new RegExp(/^(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/, "g")

/**
 * Splits free-form text into whitespace-separated tokens.
 *
 * Runs of whitespace (spaces, newlines, tabs) count as a single separator and
 * empty tokens are dropped, so that the token preceding a keyword is always a
 * real word rather than an empty string.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function makeTokensFromMultilineText(text) {
  return text.split(/\s+/).filter((token) => token !== "")
}

/**
 * Returns the token that immediately precedes the first "followers" token
 * which has a predecessor.
 *
 * @param {string[]} tokens - Tokens produced from the page text.
 * @returns {string} The preceding token, or "" when there is none.
 */
function getFollowers(tokens) {
  // Start at 1: a leading "followers" has nothing in front of it to report.
  for (let i = 1; i < tokens.length; i++) {
    if (tokens[i] === "followers") {
      return tokens[i - 1]
    }
  }
  return ""
}

/**
 * Collects every token that is, in its entirety, an e-mail address.
 *
 * @param {string[]} tokens - Tokens produced from the page text.
 * @returns {Set<string>} Distinct matching tokens.
 */
function getEmail(tokens) {
  const emails = new Set()
  for (const token of tokens) {
    // EMAIL_RGX is anchored with ^...$, so a match always spans the whole token.
    const matches = token.match(EMAIL_RGX)
    if (matches != null) {
      emails.add(matches[0])
    }
  }
  return emails
}

/**
 * Adds the href of every anchor found inside `section` to `links`.
 *
 * @param {object} section - Locator for the container to search.
 * @param {Set<string>} links - Set that receives the hrefs (mutated in place).
 * @returns {Promise<void>}
 */
async function collectLinks(section, links) {
  for (const anchor of await section.locator("a").all()) {
    const href = await anchor.getAttribute("href")
    // Anchors without an href yield null; skip them so the joined output
    // does not contain empty entries.
    if (href != null) {
      links.add(href)
    }
  }
}

/**
 * Opens the "about" page of a Twitch channel and extracts e-mail addresses,
 * outbound links and the follower count from its about and panels sections.
 *
 * Scraping is best-effort: if a section cannot be read, whatever was gathered
 * up to that point is still returned. Navigation errors from `page.goto` are
 * propagated to the caller; the page is closed in every case.
 *
 * @param {object} browser - Browser handle exposing `newPage()` (the calls
 *   used here match the Playwright API - TODO confirm against the caller).
 * @param {string} userName - Channel name, interpolated into the URL as-is.
 * @returns {Promise<{email: string, links: string, followers: string}>}
 *   `email` and `links` are comma-joined lists; `followers` is the raw token
 *   found before the word "followers", or "" when none was found.
 */
export async function GetTwitchStreamerInfo(browser, userName) {
  const page = await browser.newPage()

  try {
    await page.goto(`https://twitch.tv/${userName}/about`)

    const links = new Set()
    const texts = []

    try {
      const aboutSection = page.locator(".about-section")
      texts.push(await aboutSection.innerText())
      await collectLinks(aboutSection, links)

      const panelsSection = page.locator(".channel-panels-container")
      // isVisible() returns a promise; it must be awaited, otherwise the
      // condition is always truthy and innerText() fails on channels that
      // have no panels.
      if (await panelsSection.isVisible()) {
        texts.push(await panelsSection.innerText())
        await collectLinks(panelsSection, links)
      }
    } catch (e) {
      // It's fine if we fail here: keep whatever text and links were
      // collected before the failure and report those.
    }

    const tokens = makeTokensFromMultilineText(texts.join(" "))

    return {
      email: Array.from(getEmail(tokens)).join(","),
      links: Array.from(links).join(","),
      followers: getFollowers(tokens)
    }
  } finally {
    // Always release the page, including when navigation throws.
    await page.close()
  }
}