scrapeChannels.ts
import fs from 'node:fs/promises';
import { setTimeout } from 'node:timers/promises';

import type { TimelineResponseCoubs } from '@/request/timeline';
import type { Backup } from '@/storage/backup';
import type { RawBlockedChannels } from '@/storage/blockedChannels';

/*
 * Collects unique channels from the daily timelines of every community and
 * writes them to `DEST` wrapped in a partial `Backup` object.
 *
 * Usage: pass the desired number of channels as the first CLI argument.
 * Communities are polled round-robin, one request per second, until the
 * target is reached or every community stops yielding new channels.
 */

/** Output file path, resolved against the current working directory. */
const DEST = 'blockedChannels.json';
/** Target number of unique channels; falls back to 1000 when the argument is missing, unparsable or 0. */
const size = Number.parseInt(process.argv[2], 10) || 1000;
/** A community is dropped after this many consecutive requests that add no new channel. */
const MAX_RUNNING_ZEROES = 10;

// get communities list:
// 1. open `https://coub.com`
// 2. run `Array.from(document.querySelectorAll('.main-menu__community-item'), node => node.dataset['communityPermalink']).sort()`
// biome-ignore format: keep it flat
const COMMUNITIES = ['animals-pets', 'anime', 'art', 'blogging', 'cars', 'cartoons', 'celebrity', 'dance', 'fashion', 'food-kitchen', 'gaming', 'live-pictures', 'mashup', 'memes', 'movies', 'music', 'nature-travel', 'science-technology', 'sports', 'standup-jokes'] as const;

// Parallel arrays: index `n` of each array describes the same channel.
// They are preallocated to `size` and truncated to the real count before writing.
const data: RawBlockedChannels = {
  id: new Array(size),
  title: new Array(size),
  permalink: new Array(size),
};

const backup: Partial<Backup> = {
  blockedChannels: data,
  // NOTE(review): presumably a schema/version marker for `blockedChannels` — confirm against the `Backup` type.
  blockedChannels$: { v: 1 },
};

/** Ids of channels already stored in `data`, used to skip duplicates. */
const channelIds = new Set<number>();

/** Per-community scraping progress. */
interface State {
  /** Page number to send with the next request (starts at 1). */
  page: number;
  /** `next` value of the last response; sent as the `anchor` query parameter when it is a number. */
  next: number | undefined;
  /** Count of consecutive requests that contributed no new channel. */
  runningZeros: number;
}

// Communities that are still being polled; exhausted ones are removed in place.
const states = COMMUNITIES.map((community): [typeof community, State] => [
  community,
  {
    page: 1,
    next: undefined,
    runningZeros: 0,
  },
]);

// `i` is the number of channels collected so far (and the next write index).
let i = 0;
// Position in `states` of the community to query next.
let statesIndex = 0;

// `size` is always >= 1 here, so this runs at least once, like the former do/while.
while (i < size) {
  const [community, state] = states[statesIndex];

  const page = state.page;
  const url =
    `https://coub.com/api/v2/timeline/community/${community}/daily?page=${page}` +
    (typeof state.next === 'number' ? `&anchor=${state.next}` : '');

  const response = await fetch(url);

  // Fail with a clear message instead of an obscure JSON/iteration error
  // when the API answers with a non-2xx status.
  if (!response.ok) {
    throw new Error(`Request failed with ${response.status} ${response.statusText}: ${url}`);
  }

  const res = (await response.json()) as TimelineResponseCoubs;
  state.next = res.next;

  const start = i;

  for (const coub of res.coubs) {
    if (!channelIds.has(coub.channel.id)) {
      data.id[i] = coub.channel.id;
      data.title[i] = coub.channel.title;
      data.permalink[i] = coub.channel.permalink;
      channelIds.add(coub.channel.id);
      i += 1;

      if (i === size) {
        break;
      }
    }
  }

  const diff = i - start;
  console.log(`${i} / ${size} (page: ${page}; +${diff}; ${community})`);
  state.page += 1;

  if (diff > 0) {
    state.runningZeros = 0;
  } else {
    state.runningZeros += 1;
  }

  if (state.runningZeros >= MAX_RUNNING_ZEROES) {
    // This community has stopped producing new channels: stop polling it.
    states.splice(statesIndex, 1);

    if (states.length === 0) {
      break;
    }

    // The following community has shifted into `statesIndex`, so no advance is
    // needed — but when the removed entry was the last one the index now points
    // past the end and must wrap back to the start.
    statesIndex %= states.length;
  } else {
    statesIndex = (statesIndex + 1) % states.length;
  }

  // Throttle to one request per second; no need to wait once the target is reached.
  if (i < size) {
    await setTimeout(1000);
  }
}

// Drop the unused tail of the preallocated arrays.
data.id.length = i;
data.title.length = i;
data.permalink.length = i;

await fs.writeFile(DEST, JSON.stringify(backup));

const formattedSize = new Intl.NumberFormat('en').format(i);
console.log(`Wrote ${formattedSize} channels to ${DEST}`);