// utils/dev/scrapeChannels.ts
  1  import fs from 'node:fs/promises';
  2  import { setTimeout } from 'node:timers/promises';
  3  
  4  import type { TimelineResponseCoubs } from '@/request/timeline';
  5  import type { Backup } from '@/storage/backup';
  6  import type { RawBlockedChannels } from '@/storage/blockedChannels';
  7  
  8  const DEST = 'blockedChannels.json';
  9  const size = Number.parseInt(process.argv[2], 10) || 1000;
 10  const MAX_RUNNING_ZEROES = 10;
 11  
 12  // get communities list:
 13  // 1. open `https://coub.com`
 14  // 2. run `Array.from(document.querySelectorAll('.main-menu__community-item'), node => node.dataset['communityPermalink']).sort()`
 15  // biome-ignore format: keep it flat
 16  const COMMUNITIES = ['animals-pets', 'anime', 'art', 'blogging', 'cars', 'cartoons', 'celebrity', 'dance', 'fashion', 'food-kitchen', 'gaming', 'live-pictures', 'mashup', 'memes', 'movies', 'music', 'nature-travel', 'science-technology', 'sports', 'standup-jokes'] as const;
 17  
 18  const data: RawBlockedChannels = {
 19  	id: new Array(size),
 20  	title: new Array(size),
 21  	permalink: new Array(size),
 22  };
 23  
 24  const backup: Partial<Backup> = {
 25  	blockedChannels: data,
 26  	blockedChannels$: { v: 1 },
 27  };
 28  
 29  const channelIds = new Set<number>();
 30  
 31  interface State {
 32  	page: number;
 33  	next: number | undefined;
 34  	runningZeros: number;
 35  }
 36  
 37  const states = COMMUNITIES.map((community): [typeof community, State] => [
 38  	community,
 39  	{
 40  		page: 1,
 41  		next: undefined,
 42  		runningZeros: 0,
 43  	},
 44  ]);
 45  
 46  let i = 0,
 47  	statesIndex = 0;
 48  
 49  do {
 50  	const [community, state] = states[statesIndex];
 51  
 52  	const page = state.page;
 53  	const res = (await (
 54  		await fetch(
 55  			`https://coub.com/api/v2/timeline/community/${community}/daily?page=${page}` +
 56  				(typeof state.next === 'number' ? `&anchor=${state.next}` : ''),
 57  		)
 58  	).json()) as TimelineResponseCoubs;
 59  	state.next = res.next;
 60  
 61  	const start = i;
 62  
 63  	for (const coub of res.coubs) {
 64  		if (!channelIds.has(coub.channel.id)) {
 65  			data.id[i] = coub.channel.id;
 66  			data.title[i] = coub.channel.title;
 67  			data.permalink[i] = coub.channel.permalink;
 68  			channelIds.add(coub.channel.id);
 69  			i += 1;
 70  
 71  			if (i === size) {
 72  				break;
 73  			}
 74  		}
 75  	}
 76  
 77  	const diff = i - start;
 78  	console.log(`${i} / ${size} (page: ${page}; +${diff}; ${community})`);
 79  	state.page += 1;
 80  
 81  	let isRotate = true;
 82  
 83  	if (diff > 0) {
 84  		state.runningZeros = 0;
 85  	} else {
 86  		state.runningZeros += 1;
 87  
 88  		if (state.runningZeros >= MAX_RUNNING_ZEROES) {
 89  			states.splice(statesIndex, 1);
 90  			isRotate = false;
 91  
 92  			if (states.length === 0) {
 93  				break;
 94  			}
 95  		}
 96  	}
 97  
 98  	if (isRotate) {
 99  		statesIndex = (statesIndex + 1) % states.length;
100  	}
101  
102  	await setTimeout(1000);
103  } while (i < size);
104  
// The columns were allocated for the full target size; shrink each one to the
// number of channels that were actually stored before serializing.
for (const column of [data.id, data.title, data.permalink]) {
	column.length = i;
}

// Persist the backup object as JSON.
await fs.writeFile(DEST, JSON.stringify(backup));

// Report the result with English digit grouping.
console.log(`Wrote ${i.toLocaleString('en')} channels to ${DEST}`);