# allthethings/utils.py
import jwt
import re
import ipaddress
import flask
import functools
import datetime
import forex_python.converter
import cachetools
import babel.numbers
import babel
import os
import base64
import base58
import hashlib
import urllib.parse
import orjson
import isbnlib
import math
import bip_utils
import shortuuid
import pymysql
import httpx
import indexed_zstd
import threading

from flask_babel import gettext, get_babel, force_locale

from flask import Blueprint, request, g, make_response, render_template
from flask_cors import cross_origin
from sqlalchemy import select, func, text, inspect
from sqlalchemy.orm import Session
from flask_babel import format_timedelta

from allthethings.extensions import es, es_aux, engine, mariapersist_engine, MariapersistDownloadsTotalByMd5, mail, MariapersistDownloadsHourlyByMd5, MariapersistDownloadsHourly, MariapersistMd5Report, MariapersistAccounts, MariapersistComments, MariapersistReactions, MariapersistLists, MariapersistListEntries, MariapersistDonations, MariapersistDownloads, MariapersistFastDownloadAccess
from config.settings import SECRET_KEY, DOWNLOADS_SECRET_KEY, MEMBERS_TELEGRAM_URL, FLASK_DEBUG, PAYMENT2_URL, PAYMENT2_API_KEY, PAYMENT2_PROXIES, FAST_PARTNER_SERVER1, HOODPAY_URL, HOODPAY_AUTH

FEATURE_FLAGS = {}

# FAST_PARTNER_SERVER1 comes from config and may be None when unset, so filter it out.
FAST_DOWNLOAD_DOMAINS = [x for x in [FAST_PARTNER_SERVER1, 'wbsg8v.xyz', 'momot.rs'] if x is not None]
# SLOW_DOWNLOAD_DOMAINS = ['momot.rs', 'ktxr.rs', 'nrzr.li']
SLOW_DOWNLOAD_DOMAINS = ['momot.rs', 'nrzr.li', 'wbsg8v.xyz']

def validate_canonical_md5s(canonical_md5s):
    """Return True if every entry is a canonical MD5: exactly 32 lowercase hex characters."""
    return all(bool(re.match(r"^[a-f\d]{32}$", canonical_md5)) for canonical_md5 in canonical_md5s)

def validate_ol_editions(ol_editions):
    """Return True if every entry looks like an Open Library edition ID, e.g. 'OL123M'."""
    return all(bool(re.match(r"^OL[\d]+M$", ol_edition)) for ol_edition in ol_editions)

def validate_oclc_ids(oclc_ids):
    """Return True if every entry is a purely numeric OCLC (WorldCat) ID."""
    return all(str(oclc_id).isdigit() for oclc_id in oclc_ids)

def validate_duxiu_ssids(duxiu_ssids):
    """Return True if every entry is a purely numeric DuXiu SSID."""
    return all(str(duxiu_ssid).isdigit() for duxiu_ssid in duxiu_ssids)

def validate_aarecord_ids(aarecord_ids):
    """Return True if every 'prefix:value' aarecord ID parses and passes the per-prefix validators."""
    try:
        split_ids = split_aarecord_ids(aarecord_ids)
    except Exception:
        # Unknown prefix (KeyError) or missing ':' separator (IndexError).
        return False
    return validate_canonical_md5s(split_ids['md5']) and validate_ol_editions(split_ids['ol']) and validate_oclc_ids(split_ids['oclc']) and validate_duxiu_ssids(split_ids['duxiu_ssid'])

def split_aarecord_ids(aarecord_ids):
    """Group 'prefix:value' IDs into a dict of lists keyed by the known prefixes.

    Raises KeyError for an unknown prefix and IndexError when ':' is missing;
    validate_aarecord_ids relies on these to reject malformed input.
    """
    ret = {
        'md5': [],
        'ia': [],
        'isbn': [],
        'ol': [],
        'doi': [],
        'oclc': [],
        'duxiu_ssid': [],
        'cadal_ssno': [],
    }
    for aarecord_id in aarecord_ids:
        split_aarecord_id = aarecord_id.split(':', 1)
        ret[split_aarecord_id[0]].append(split_aarecord_id[1])
    return ret

def doi_is_isbn(doi):
    """DOIs under the 10.978/10.979 ("Bookland") prefixes encode ISBNs, not articles."""
    return doi.startswith(('10.978.', '10.979.'))

def scidb_info(aarecord, additional=None):
    """Return SciDB viewer info for a PDF journal article, or None when not eligible.

    Eligible records need at least one non-ISBN DOI and a 'pdf' best extension.
    priority 1 = servable from a partner server; priority 2 = Sci-Hub link only.
    """
    if additional is None:
        additional = aarecord['additional']

    valid_dois = [doi for doi in aarecord['file_unified_data']['identifiers_unified'].get('doi') or [] if not doi_is_isbn(doi)]
    if len(valid_dois) == 0:
        return None
    if aarecord['file_unified_data']['extension_best'] != "pdf":
        return None

    scihub_link = None
    scihub_doi = aarecord.get('scihub_doi') or []
    if len(scihub_doi) > 0:
        scihub_link = f"https://sci-hub.ru/{scihub_doi[0]['doi']}"

    # Non-articles are only shown when Sci-Hub has them anyway.
    if (aarecord['file_unified_data']['content_type'] != "journal_article") and (scihub_link is None):
        return None

    path_info = None
    if len(additional['partner_url_paths']) > 0:
        path_info = additional['partner_url_paths'][0]

    if path_info:
        priority = 1
    elif scihub_link:
        priority = 2
    else:
        return None

    return { "priority": priority, "doi": valid_dois[0], "path_info": path_info, "scihub_link": scihub_link }

# Standard JWT header for {"alg":"HS256","typ":"JWT"}. It is stripped from tokens
# before storing them in cookies (to save space) and re-added before decoding.
JWT_PREFIX = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.'
ACCOUNT_COOKIE_NAME = "aa_account_id2"

def strip_jwt_prefix(jwt_payload):
    """Drop the fixed HS256 JWT header prefix from a token; raise when it is absent."""
    if not jwt_payload.startswith(JWT_PREFIX):
        raise Exception("Invalid jwt_payload; wrong prefix")
    return jwt_payload[len(JWT_PREFIX):]

def get_account_id(cookies):
    """Decode the (prefix-stripped) account JWT cookie; return the account ID or None."""
    if len(cookies.get(ACCOUNT_COOKIE_NAME, "")) == 0:
        return None
    account_data = jwt.decode(
        jwt=JWT_PREFIX + cookies[ACCOUNT_COOKIE_NAME],
        key=SECRET_KEY,
        algorithms=["HS256"],
        options={ "verify_signature": True, "require": ["iat"], "verify_iat": True }
    )
    return account_data["a"]

def secret_key_from_account_id(account_id):
    """Derive a user-facing secret key: the account ID followed by a keyed check string."""
    digest = hashlib.md5(f"{SECRET_KEY}{account_id}".encode('utf-8')).digest()
    hashkey = base58.b58encode(digest).decode('utf-8')
    return f"{account_id}{hashkey}"

def account_id_from_secret_key(secret_key):
    """Inverse of secret_key_from_account_id; return None when the check string is wrong."""
    account_id = secret_key[0:7]
    if secret_key != secret_key_from_account_id(account_id):
        return None
    return account_id

def get_domain_lang_code(locale):
    """Map a babel Locale to the short code used in our domain names."""
    if locale.script == 'Hant':
        return 'tw'
    if str(locale) == 'nb_NO':
        return 'no'
    return str(locale)

def domain_lang_code_to_full_lang_code(domain_lang_code):
    """Inverse of get_domain_lang_code: expand domain shorthand to a full locale string."""
    if domain_lang_code == "tw":
        return 'zh_Hant'
    if domain_lang_code == "no":
        return 'nb_NO'
    return domain_lang_code

def get_full_lang_code(locale):
    """Full locale string, e.g. 'zh_Hant'."""
    return str(locale)

def get_base_lang_code(locale):
    """Bare language subtag, e.g. 'zh' for 'zh_Hant'."""
    return locale.language

# Adapted from https://github.com/python-babel/flask-babel/blob/69d3340cd0ff52f3e23a47518285a7e6d8f8c640/flask_babel/__init__.py#L175
def list_translations():
    """Return locales that have at least one compiled (.mo) translation catalog."""
    # return [locale for locale in babel.list_translations() if is_locale(locale)]
    translations = []
    for directory in get_babel().translation_directories:
        if not os.path.isdir(directory):
            continue
        for entry in os.listdir(directory):
            locale_dir = os.path.join(directory, entry, 'LC_MESSAGES')
            if not os.path.isdir(locale_dir):
                continue
            if any(filename.endswith('.mo') for filename in os.listdir(locale_dir)):
                try:
                    translations.append(babel.Locale.parse(entry))
                except babel.UnknownLocaleError:
                    pass
    return translations

# Example to convert back from MySQL to IPv4:
# import ipaddress
# ipaddress.ip_address(0x2002AC16000100000000000000000000).sixtofour
# ipaddress.ip_address().sixtofour
def canonical_ip_bytes(ip):
    """Canonicalize an IPv4/IPv6 address to 16 packed IPv6 bytes (v4 is 6to4-mapped)."""
    addr = ipaddress.ip_address(ip)
    if addr.version == 4:
        # https://stackoverflow.com/a/19853184
        prefix = int(ipaddress.IPv6Address('2002::'))
        addr = ipaddress.ip_address(prefix | (int(addr) << 80))
    return addr.packed


def public_cache(cloudflare_minutes=0, minutes=0):
    """Decorator factory: mark successful responses publicly cacheable (browser + Cloudflare)."""
    def decorator(view_func):
        @functools.wraps(view_func)
        def wrapper(*args, **kwargs):
            response = flask.make_response(view_func(*args, **kwargs))
            existing_policy = response.headers.get('Cache-Control')
            if existing_policy is not None:
                # The view chose its own policy; mirror it for Cloudflare.
                response.headers.add('Cloudflare-CDN-Cache-Control', existing_policy)
            elif response.status_code <= 299:
                response.headers.add('Cache-Control', f"public,max-age={int(60 * minutes)},s-maxage={int(60 * minutes)}")
                response.headers.add('Cloudflare-CDN-Cache-Control', f"max-age={int(60 * cloudflare_minutes)}")
            else:
                # Never cache error responses.
                response.headers.add('Cache-Control', 'no-cache')
                response.headers.add('Cloudflare-CDN-Cache-Control', 'no-cache')
            return response
        return wrapper
    return decorator

def no_cache():
    """Decorator factory: disable caching for the wrapped view, at browser and Cloudflare."""
    def decorator(view_func):
        @functools.wraps(view_func)
        def wrapper(*args, **kwargs):
            response = flask.make_response(view_func(*args, **kwargs))
            response.headers.add('Cache-Control', 'no-cache')
            response.headers.add('Cloudflare-CDN-Cache-Control', 'no-cache')
            return response
        return wrapper
    return decorator

def get_md5_report_type_mapping():
    """Localized labels for the MD5 problem-report categories."""
    return {
        'metadata': gettext('common.md5_report_type_mapping.metadata'),
        'download': gettext('common.md5_report_type_mapping.download'),
        'broken': gettext('common.md5_report_type_mapping.broken'),
        'pages': gettext('common.md5_report_type_mapping.pages'),
        'spam': gettext('common.md5_report_type_mapping.spam'),
        'copyright': gettext('common.md5_report_type_mapping.copyright'),
        'other': gettext('common.md5_report_type_mapping.other'),
    }

def donation_id_to_receipt_id(donation_id):
    """Re-encode a donation shortuuid as a receipt ID using a less ambiguous alphabet."""
    raw_uuid = shortuuid.decode(donation_id)
    return shortuuid.ShortUUID(alphabet="23456789abcdefghijkmnopqrstuvwxyz").encode(raw_uuid)

def receipt_id_to_donation_id(receipt_id):
    """Inverse of donation_id_to_receipt_id."""
    raw_uuid = shortuuid.ShortUUID(alphabet="23456789abcdefghijkmnopqrstuvwxyz").decode(receipt_id)
    return shortuuid.encode(raw_uuid)

@cachetools.cached(cache=cachetools.TTLCache(maxsize=1024, ttl=6*60*60))
def usd_currency_rates_cached():
    """USD exchange rates. Currently a hardcoded snapshot; live lookup is disabled below."""
    # try:
    #     return forex_python.converter.CurrencyRates().get_rates('USD')
    # except forex_python.converter.RatesNotAvailableError:
    #     print("RatesNotAvailableError -- using fallback!")
    #     # 2023-05-04 fallback
    return {'EUR': 0.9161704076958315, 'JPY': 131.46129180027486, 'BGN': 1.7918460833715073, 'CZK': 21.44663307375172, 'DKK': 6.8263857077416406, 'GBP': 0.8016032982134678, 'HUF': 344.57169033440226, 'PLN': 4.293449381584975, 'RON': 4.52304168575355, 'SEK': 10.432890517636281, 'CHF': 0.9049931287219424, 'ISK': 137.15071003206597, 'NOK': 10.43105817682089, 'TRY': 19.25744388456253, 'AUD': 1.4944571690334403, 'BRL': 5.047732478240953, 'CAD': 1.3471369674759506, 'CNY': 6.8725606962895105, 'HKD': 7.849931287219422, 'IDR': 14924.993128721942, 'INR': 81.87402656894183, 'KRW': 1318.1951442968393, 'MXN': 18.288960146587264, 'MYR': 4.398992212551534, 'NZD': 1.592945487860742, 'PHP': 54.56894182317912, 'SGD': 1.3290884104443428, 'THB': 34.054970224461755, 'ZAR': 18.225286303252407}

@functools.cache
def membership_tier_names(locale):
    """Localized display names for membership tiers, rendered under the given locale."""
    with force_locale(locale):
        return {
            "1": gettext('common.membership.tier_name.bonus'),
            "2": gettext('common.membership.tier_name.2'),
            "3": gettext('common.membership.tier_name.3'),
            "4": gettext('common.membership.tier_name.4'),
            "5": gettext('common.membership.tier_name.5'),
        }

# Monthly price per tier, in whole USD.
MEMBERSHIP_TIER_COSTS = {
    "2": 5, "3": 10, "4": 30, "5": 100,
}
# Percentage discount per payment method.
MEMBERSHIP_METHOD_DISCOUNTS = {
    # Note: keep manually in sync with HTML.
    # "crypto": 20,
    # "payment2": 20,
    # # "cc": 20,
    # "binance": 20,
    # "paypal": 20,
    # "payment2paypal": 20,
    # "payment2cc": 20,
    # "payment2cashapp": 20,

    "crypto": 0,
    "payment2": 0,
    # "cc": 0,
    "binance": 0,
    "paypal": 0,
    "payment2paypal": 0,
    "payment2cc": 0,
    "payment2cashapp": 0,

    "paypalreg": 0,
    "amazon": 0,
    # "bmc": 0,
    # "alipay": 0,
    # "pix": 0,
    "payment1": 0,
    "payment1_alipay": 0,
    "payment1_wechat": 0,
    "payment1b": 0,
    "payment1bb": 0,
    "givebutter": 0,
    "hoodpay": 0,
}
# Percentage discount per subscription length (months).
MEMBERSHIP_DURATION_DISCOUNTS = {
    # Note: keep manually in sync with HTML.
    "1": 0, "3": 5, "6": 10, "12": 15, "24": 25,
}
MEMBERSHIP_DOWNLOADS_PER_DAY = {
    "1": 0, "2": 20, "3": 50, "4": 100, "5": 1000,
}
# Keep in sync.
MEMBERSHIP_BONUSDOWNLOADS_PER_DAY = {
    "1": 0, "2": 10, "3": 25, "4": 50, "5": 500,
}
MEMBERSHIP_TELEGRAM_URL = {
    "1": "", "2": "", "3": "", "4": MEMBERS_TELEGRAM_URL, "5": MEMBERS_TELEGRAM_URL,
}
# Minimum donation per payment method, in USD cents.
MEMBERSHIP_METHOD_MINIMUM_CENTS_USD = {
    "crypto": 0,
    "payment2": 0,
    # "cc": 20,
    "binance": 0,
    "paypal": 3500,
    "payment2paypal": 1500,
    "payment2cashapp": 0,
    "payment2cc": 0,
    "paypalreg": 0,
    "amazon": 1000,
    # "bmc": 0,
    # "alipay": 0,
    # "pix": 0,
    "payment1": 1000,
    "payment1_alipay": 1000,
    "payment1_wechat": 1000,
    "payment1b": 1000,
    "payment1bb": 1000,
    "givebutter": 500,
    "hoodpay": 1000,
}
# Maximum donation per payment method, in the method's native currency cents.
MEMBERSHIP_METHOD_MAXIMUM_CENTS_NATIVE = {
    # "payment1": 30000,
    "payment1b": 100000,
    "payment1bb": 100000,
    "amazon": 10000,
}
MEMBERSHIP_MAX_BONUS_DOWNLOADS = 10000

def get_account_fast_download_info(mariapersist_session, account_id):
    """Summarize fast-download quota for an account, or None without an active membership.

    Returns a dict with 'downloads_left', 'recently_downloaded_md5s' (last 24h, up to
    10000), 'downloads_per_day', and the highest tier's 'telegram_url'.
    """
    mariapersist_session.connection().connection.ping(reconnect=True)
    cursor = mariapersist_session.connection().connection.cursor(pymysql.cursors.DictCursor)
    cursor.execute('SELECT mariapersist_memberships.membership_tier AS membership_tier, mariapersist_memberships.bonus_downloads AS bonus_downloads FROM mariapersist_accounts INNER JOIN mariapersist_memberships USING (account_id) WHERE mariapersist_accounts.account_id = %(account_id)s AND mariapersist_memberships.membership_expiration >= CURDATE()', { 'account_id': account_id })
    memberships = cursor.fetchall()
    if len(memberships) == 0:
        return None

    # Multiple concurrent memberships stack; bonus downloads are capped globally.
    downloads_per_day = 0
    bonus_downloads = 0
    for membership in memberships:
        downloads_per_day += MEMBERSHIP_DOWNLOADS_PER_DAY[membership['membership_tier']]
        bonus_downloads += membership['bonus_downloads']
    if bonus_downloads > MEMBERSHIP_MAX_BONUS_DOWNLOADS:
        bonus_downloads = MEMBERSHIP_MAX_BONUS_DOWNLOADS
    downloads_per_day += bonus_downloads

    downloads_left = downloads_per_day
    recently_downloaded_md5s = [md5.hex() for md5 in mariapersist_session.connection().execute(select(MariapersistFastDownloadAccess.md5).where((MariapersistFastDownloadAccess.timestamp >= datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=1)) & (MariapersistFastDownloadAccess.account_id == account_id)).limit(10000)).scalars()]
    downloads_left -= len(recently_downloaded_md5s)

    max_tier = str(max([int(membership['membership_tier']) for membership in memberships]))

    return { 'downloads_left': max(0, downloads_left), 'recently_downloaded_md5s': recently_downloaded_md5s, 'downloads_per_day': downloads_per_day, 'telegram_url': MEMBERSHIP_TELEGRAM_URL[max_tier] }

def get_referral_account_id(mariapersist_session, potential_ref_account_id, current_account_id):
    """Validate a referral: no self-referrals, and eligibility is checked via account_can_make_referrals."""
    if potential_ref_account_id is None:
        return None
    if potential_ref_account_id == current_account_id:
        return None
    # NOTE(review): this checks the *current* account's referral eligibility rather than
    # the referrer's (potential_ref_account_id) — confirm this is intentional.
    if account_can_make_referrals(mariapersist_session, current_account_id):
        return potential_ref_account_id
    else:
        return None

def account_can_make_referrals(mariapersist_session, account_id):
    """True when the account holds an active, paid (tier >= 2) membership."""
    mariapersist_session.connection().connection.ping(reconnect=True)
    cursor = mariapersist_session.connection().connection.cursor(pymysql.cursors.DictCursor)
    # Note the mariapersist_memberships.membership_tier >= 2 so we don't count bonus memberships.
    cursor.execute('SELECT COUNT(*) AS count FROM mariapersist_accounts INNER JOIN mariapersist_memberships USING (account_id) WHERE mariapersist_accounts.account_id = %(account_id)s AND mariapersist_memberships.membership_expiration >= CURDATE() AND mariapersist_memberships.membership_tier >= 2', { 'account_id': account_id })
    return (cursor.fetchone()['count'] > 0)

def cents_to_usd_str(cents):
    """Format integer cents as a dollar string, e.g. 1234 -> '12.34'.

    Fixed to zero-pad amounts under one dollar (5 -> '0.05'); the previous
    string-slicing implementation produced '.5' for such inputs.
    """
    return f"{cents // 100}.{cents % 100:02d}"

def format_currency(cost_cents_native_currency, native_currency_code, locale):
    """Localized currency string; strips a trailing '.00'/',00' from whole amounts."""
    output = babel.numbers.format_currency(cost_cents_native_currency / 100, native_currency_code, locale=locale)
    if output.endswith('.00') or output.endswith(',00'):
        output = output[0:-3]
    return output

def membership_format_native_currency(locale, native_currency_code, cost_cents_native_currency, cost_cents_usd):
    """Build the localized display strings for a price, in native currency and USD."""
    with force_locale(locale):
        if native_currency_code != 'USD':
            return {
                'cost_cents_native_currency_str_calculator': gettext('common.membership.format_currency.total_with_usd', amount=format_currency(cost_cents_native_currency, native_currency_code, locale), amount_usd=format_currency(cost_cents_usd, 'USD', locale)),
                'cost_cents_native_currency_str_button': f"{format_currency(cost_cents_native_currency, native_currency_code, locale)}",
                'cost_cents_native_currency_str_donation_page_formal': gettext('common.membership.format_currency.amount_with_usd', amount=format_currency(cost_cents_native_currency, native_currency_code, locale), amount_usd=format_currency(cost_cents_usd, 'USD', locale)),
                'cost_cents_native_currency_str_donation_page_instructions': gettext('common.membership.format_currency.amount_with_usd', amount=format_currency(cost_cents_native_currency, native_currency_code, locale), amount_usd=format_currency(cost_cents_usd, 'USD', locale)),
            }
        # elif native_currency_code == 'COFFEE':
        #     return {
        #         'cost_cents_native_currency_str_calculator': f"{format_currency(cost_cents_native_currency * 5, 'USD', locale)} ({cost_cents_native_currency} ☕️) total",
        #         'cost_cents_native_currency_str_button': f"{format_currency(cost_cents_native_currency * 5, 'USD', locale)}",
        #         'cost_cents_native_currency_str_donation_page_formal': f"{format_currency(cost_cents_native_currency * 5, 'USD', locale)} ({cost_cents_native_currency} ☕️)",
        #         'cost_cents_native_currency_str_donation_page_instructions': f"{cost_cents_native_currency} “coffee” ({format_currency(cost_cents_native_currency * 5, 'USD', locale)})",
        #     }
        else:
            return {
                'cost_cents_native_currency_str_calculator': gettext('common.membership.format_currency.total', amount=format_currency(cost_cents_usd, 'USD', locale)),
                'cost_cents_native_currency_str_button': f"{format_currency(cost_cents_native_currency, 'USD', locale)}",
                'cost_cents_native_currency_str_donation_page_formal': f"{format_currency(cost_cents_native_currency, 'USD', locale)}",
                'cost_cents_native_currency_str_donation_page_instructions': f"{format_currency(cost_cents_native_currency, 'USD', locale)}",
            }

@cachetools.cached(cache=cachetools.TTLCache(maxsize=1024, ttl=60*60))
def membership_costs_data(locale):
    """Precompute pricing for every (tier, method, duration) combination.

    Returns a dict keyed by "tier,method,duration" with USD and native-currency
    amounts plus localized display strings. Cached per locale for one hour.
    """
    usd_currency_rates = usd_currency_rates_cached()

    def calculate_membership_costs(inputs):
        tier = inputs['tier']
        method = inputs['method']
        duration = inputs['duration']
        if (tier not in MEMBERSHIP_TIER_COSTS.keys()) or (method not in MEMBERSHIP_METHOD_DISCOUNTS.keys()) or (duration not in MEMBERSHIP_DURATION_DISCOUNTS.keys()):
            raise Exception("Invalid fields")

        # Tier cost is whole dollars/month; multiplying by (100-discounts) yields discounted cents.
        discounts = MEMBERSHIP_METHOD_DISCOUNTS[method] + MEMBERSHIP_DURATION_DISCOUNTS[duration]
        monthly_cents = round(MEMBERSHIP_TIER_COSTS[tier]*(100-discounts))
        cost_cents_usd = monthly_cents * int(duration)

        native_currency_code = 'USD'
        cost_cents_native_currency = cost_cents_usd
        if method in ['alipay', 'payment1', 'payment1_alipay', 'payment1_wechat', 'payment1b', 'payment1bb']:
            # Hardcoded USD->CNY rate of 7, rounded down to whole yuan.
            native_currency_code = 'CNY'
            cost_cents_native_currency = math.floor(cost_cents_usd * 7 / 100) * 100
        # elif method == 'bmc':
        #     native_currency_code = 'COFFEE'
        #     cost_cents_native_currency = round(cost_cents_usd / 500)
        elif method == 'amazon':
            # Amazon gift cards only come in fixed denominations; snap to the nearest one.
            if cost_cents_usd <= 500:
                cost_cents_usd = 500
            elif cost_cents_usd <= 1000:
                cost_cents_usd = 1000
            elif cost_cents_usd <= 1500:
                cost_cents_usd = 1500
            elif cost_cents_usd <= 2000:
                cost_cents_usd = 2000
            elif cost_cents_usd <= 2700:
                cost_cents_usd = 2500
            elif cost_cents_usd == 5100:
                cost_cents_usd = 4500
            elif cost_cents_usd == 5400:
                cost_cents_usd = 5500
            elif cost_cents_usd == 8550:
                cost_cents_usd = 8500
            elif cost_cents_usd == 9000:
                cost_cents_usd = 8500
            elif cost_cents_usd == 30600:
                cost_cents_usd = 30000
            elif cost_cents_usd <= 100000:
                cost_cents_usd = round(cost_cents_usd / 1000) * 1000
            elif cost_cents_usd <= 200000:
                cost_cents_usd = math.ceil(cost_cents_usd / 5000) * 5000
            else:
                cost_cents_usd = math.ceil(cost_cents_usd / 10000) * 10000
            cost_cents_native_currency = cost_cents_usd
        elif method == 'pix':
            native_currency_code = 'BRL'
            cost_cents_native_currency = round(cost_cents_usd * usd_currency_rates['BRL'] / 100) * 100

        formatted_native_currency = membership_format_native_currency(locale, native_currency_code, cost_cents_native_currency, cost_cents_usd)

        return {
            'cost_cents_usd': cost_cents_usd,
            'cost_cents_usd_str': babel.numbers.format_currency(cost_cents_usd / 100.0, 'USD', locale=locale),
            'cost_cents_native_currency': cost_cents_native_currency,
            'cost_cents_native_currency_str_calculator': formatted_native_currency['cost_cents_native_currency_str_calculator'],
            'cost_cents_native_currency_str_button': formatted_native_currency['cost_cents_native_currency_str_button'],
            'native_currency_code': native_currency_code,
            'monthly_cents': monthly_cents,
            'monthly_cents_str': babel.numbers.format_currency(monthly_cents / 100.0, 'USD', locale=locale),
            'discounts': discounts,
            'duration': duration,
            'tier_name': membership_tier_names(locale)[tier],
        }

    data = {}
    for tier in MEMBERSHIP_TIER_COSTS.keys():
        for method in MEMBERSHIP_METHOD_DISCOUNTS.keys():
            for duration in MEMBERSHIP_DURATION_DISCOUNTS.keys():
                inputs = { 'tier': tier, 'method': method, 'duration': duration }
                data[f"{tier},{method},{duration}"] = calculate_membership_costs(inputs)
    return data


# Keep in sync.
def confirm_membership(cursor, donation_id, data_key, data_value):
    """Mark a donation as paid and grant the purchased membership (idempotent).

    Stores data_value under data_key in the donation's JSON blob, inserts the
    membership row (plus a tier-1 bonus membership for the referrer, if any),
    and commits. Returns True on success or when already confirmed; False
    (with a printed warning) on any validation failure.
    """
    cursor.execute('SELECT * FROM mariapersist_donations WHERE donation_id=%(donation_id)s LIMIT 1', { 'donation_id': donation_id })
    donation = cursor.fetchone()
    if donation is None:
        print(f"Warning: failed {data_key} request because of donation not found: {donation_id}")
        return False
    if donation['processing_status'] == 1:
        # Already confirmed.
        return True
    if donation['processing_status'] not in [0, 2, 4]:
        print(f"Warning: failed {data_key} request because processing_status != 0,2,4: {donation_id}")
        return False
    # # Allow for 10% margin
    # if float(data['money']) * 110 < donation['cost_cents_native_currency']:
    #     print(f"Warning: failed {data_key} request of 'money' being too small: {data}")
    #     return False

    donation_json = orjson.loads(donation['json'])
    if donation_json['method'] not in ['payment1', 'payment1_alipay', 'payment1_wechat', 'payment1b', 'payment1bb', 'payment2', 'payment2paypal', 'payment2cashapp', 'payment2cc', 'amazon', 'hoodpay']:
        print(f"Warning: failed {data_key} request because method is not valid: {donation_id}")
        return False

    cursor.execute('SELECT * FROM mariapersist_accounts WHERE account_id=%(account_id)s LIMIT 1', { 'account_id': donation['account_id'] })
    account = cursor.fetchone()
    if account is None:
        print(f"Warning: failed {data_key} request because of account not found: {donation_id}")
        return False

    new_tier = int(donation_json['tier'])
    # Naive UTC midnight today. NOTE(review): utcnow() is deprecated in Python 3.12+;
    # datetime.datetime.now(datetime.timezone.utc).date() is the modern equivalent.
    datetime_today = datetime.datetime.combine(datetime.datetime.utcnow().date(), datetime.datetime.min.time())
    # One day of slack plus 31 days per month purchased.
    new_membership_expiration = datetime_today + datetime.timedelta(days=1) + datetime.timedelta(days=31*int(donation_json['duration']))

    ref_account_id = donation_json.get('ref_account_id')
    ref_account_dict = None
    bonus_downloads = 0
    if ref_account_id is not None:
        cursor.execute('SELECT * FROM mariapersist_accounts WHERE account_id=%(account_id)s LIMIT 1', { 'account_id': ref_account_id })
        ref_account_dict = cursor.fetchone()
        if ref_account_dict is None:
            print(f"Warning: failed {data_key} request because of ref_account_dict not found: {donation_id}")
            return False
        bonus_downloads = MEMBERSHIP_BONUSDOWNLOADS_PER_DAY[str(new_tier)]

    donation_json[data_key] = data_value
    cursor.execute('INSERT INTO mariapersist_memberships (account_id, membership_tier, membership_expiration, from_donation_id, bonus_downloads) VALUES (%(account_id)s, %(membership_tier)s, %(membership_expiration)s, %(donation_id)s, %(bonus_downloads)s)', { 'membership_tier': new_tier, 'membership_expiration': new_membership_expiration, 'account_id': donation['account_id'], 'donation_id': donation_id, 'bonus_downloads': bonus_downloads })
    if (ref_account_dict is not None) and (bonus_downloads > 0):
        # The referrer receives a tier-1 bonus membership with the same expiration.
        cursor.execute('INSERT INTO mariapersist_memberships (account_id, membership_tier, membership_expiration, from_donation_id, bonus_downloads) VALUES (%(account_id)s, 1, %(membership_expiration)s, %(donation_id)s, %(bonus_downloads)s)', { 'membership_expiration': new_membership_expiration, 'account_id': ref_account_dict['account_id'], 'donation_id': donation_id, 'bonus_downloads': bonus_downloads })
    cursor.execute('UPDATE mariapersist_donations SET json=%(json)s, processing_status=1, paid_timestamp=NOW() WHERE donation_id = %(donation_id)s LIMIT 1', { 'donation_id': donation_id, 'json': orjson.dumps(donation_json) })
    cursor.execute('COMMIT')
    return True


def payment2_check(cursor, payment_id):
    """Poll the payment2 provider for a payment's status; confirm membership when paid.

    Returns (payment2_status, success). Retries the status fetch up to 3 times,
    re-raising the last error.
    """
    payment2_status = None
    for attempt in [1, 2, 3]:
        try:
            payment2_request = httpx.get(f"{PAYMENT2_URL}{payment_id}", headers={'x-api-key': PAYMENT2_API_KEY}, proxies=PAYMENT2_PROXIES, timeout=10.0)
            payment2_request.raise_for_status()
            payment2_status = payment2_request.json()
            break
        except Exception:
            # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit propagate.
            if attempt == 3:
                raise
    if payment2_status['payment_status'] in ['confirmed', 'sending', 'finished']:
        if confirm_membership(cursor, payment2_status['order_id'], 'payment2_status', payment2_status):
            return (payment2_status, True)
        else:
            return (payment2_status, False)
    return (payment2_status, True)

def hoodpay_check(cursor, hoodpay_id, donation_id):
    """Poll Hoodpay for a hosted-page payment; confirm membership when COMPLETED.

    Returns (hoodpay_status, success).
    """
    hoodpay_status = httpx.get(HOODPAY_URL.split('/v1/businesses/', 1)[0] + '/v1/public/payments/hosted-page/' + hoodpay_id, headers={"Authorization": f"Bearer {HOODPAY_AUTH}"}, proxies=PAYMENT2_PROXIES, timeout=10.0).json()['data']
    if hoodpay_status['status'] in ['COMPLETED']:
        if confirm_membership(cursor, donation_id, 'hoodpay_status', hoodpay_status):
            return (hoodpay_status, True)
        else:
            return (hoodpay_status, False)
    return (hoodpay_status, True)

def make_anon_download_uri(limit_multiple, speed_kbps, path, filename, domain):
    """Build a signed, expiring download URI for anonymous (slow) downloads.

    The trailing MD5 signs domain/limit-flag/expiry/speed/path together with
    DOWNLOADS_SECRET_KEY, so partner servers can verify links without a database.
    Restored the trailing display filename (the 'filename' parameter was unused
    due to an extraction artifact in this copy of the file).
    """
    limit_multiple_field = 'y' if limit_multiple else 'x'
    expiry = int((datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(hours=6)).timestamp())
    secure_str = f"{domain}/{limit_multiple_field}/{expiry}/{speed_kbps}/{path},{DOWNLOADS_SECRET_KEY}"
    md5 = base64.urlsafe_b64encode(hashlib.md5(secure_str.encode('utf-8')).digest()).decode('utf-8').rstrip('=')
    return f"d3/{limit_multiple_field}/{expiry}/{speed_kbps}/{urllib.parse.quote(path)}~/{md5}/{filename}"

DICT_COMMENTS_NO_API_DISCLAIMER = "This page is *not* intended as an API. If you need programmatic access to this JSON, please set up your own instance. For more information, see: https://annas-archive.org/datasets and https://annas-software.org/AnnaArchivist/annas-archive/-/tree/main/data-imports"

# Per-field annotations spliced into JSON dumps: ("before"|"after", [comment lines]).
COMMON_DICT_COMMENTS = {
    "identifier": ("after", ["Typically ISBN-10 or ISBN-13."]),
    "identifierwodash": ("after", ["Same as 'identifier' but without dashes."]),
    "locator": ("after", ["Original filename or path on the Library Genesis servers."]),
    "stripped_description": ("before", ["Anna's Archive version of the 'descr' or 'description' field, with HTML tags removed or replaced with regular whitespace."]),
    "language_codes": ("before", ["Anna's Archive version of the 'language' field, where we attempted to parse it into BCP 47 tags."]),
    "cover_url_normalized": ("after", ["Anna's Archive version of the 'coverurl' field, where we attempted to turn it into a full URL."]),
    "edition_varia_normalized": ("after", ["Anna's Archive version of the 'series', 'volume', 'edition', 'periodical', and 'year' fields; combining them into a single field for display and search."]),
    "topic_descr": ("after", ["A description of the 'topic' field using a separate database table, which seems to have its roots in the Kolxo3 library that Libgen was originally based on.",
                    "https://wiki.mhut.org/content:bibliographic_data says that this field will be deprecated in favor of Dewey Decimal."]),
    "topic": ("after", ["See 'topic_descr' below."]),
    "searchable": ("after", ["This seems to indicate that the book has been OCR'ed."]),
    "generic": ("after", ["If this is set to a different md5, then that version is preferred over this one, and should be shown in search results instead."]),
    "visible": ("after", ["If this is set, the book is in fact *not* visible in Libgen, and this string describes the reason."]),
    "commentary": ("after", ["Comments left by the uploader, an admin, or an automated process."]),
    "toc": ("before", ["Table of contents. May contain HTML."]),
    "ddc": ("after", ["See also https://libgen.li/biblioservice.php?type=ddc"]),
    "udc": ("after", ["See also https://libgen.li/biblioservice.php?type=udc"]),
    "lbc": ("after", ["See also https://libgen.li/biblioservice.php?type=bbc and https://www.isko.org/cyclo/lbc"]),
    "descriptions_mapped": ("before", ["Normalized fields by Anna's Archive, taken from the various `*_add_descr` Libgen.li tables, with comments taken from the `elem_descr` table which contain metadata about these fields, as well as sometimes our own metadata.",
                            "The names themselves are taken from `name_en` in the corresponding `elem_descr` entry (lowercased, whitespace removed), with `name_add{1,2,3}_en` to create the compound keys, such as `isbn_isbnnotes`."]),
    "identifiers_unified": ("before", ["Anna's Archive version of various identity-related fields."]),
    "classifications_unified": ("before", ["Anna's Archive version of various classification-related fields."]),
}

# Hardcoded from the `descr_elems` table.
# Maps Libgen.li edition-type codes to display names. The first group mirrors
# Crossref-style work types; the second group is fiction/periodical specific.
LGLI_EDITION_TYPE_MAPPING = {
    "b": "book",
    "ch": "book-chapter",
    "bpart": "book-part",
    "bsect": "book-section",
    "bs": "book-series",
    "bset": "book-set",
    "btrack": "book-track",
    "component": "component",
    "dataset": "dataset",
    "diss": "dissertation",
    "j": "journal",
    "a": "journal-article",
    "ji": "journal-issue",
    "jv": "journal-volume",
    "mon": "monograph",
    "oth": "other",
    "peer-review": "peer-review",
    "posted-content": "posted-content",
    "proc": "proceedings",
    "proca": "proceedings-article",
    "ref": "reference-book",
    "refent": "reference-entry",
    "rep": "report",
    "repser": "report-series",
    "s": "standard",
    "fnz": "Fanzine",
    "m": "Magazine issue",
    "col": "Collection",
    "chb": "Chapbook",
    "nonfict": "Nonfiction",
    "omni": "Omnibus",
    "nov": "Novel",
    "ant": "Anthology",
    "c": "Comics issue",
}
# Extra per-issue fields present on periodical records.
LGLI_ISSUE_OTHER_FIELDS = [
    "issue_number_in_year",
    "issue_year_number",
    "issue_number",
    "issue_volume",
    "issue_split",
    "issue_total_number",
    "issue_first_page",
    "issue_last_page",
    "issue_year_end",
    "issue_month_end",
    "issue_day_end",
    "issue_closed",
]
# Fields describing formal standards documents.
LGLI_STANDARD_INFO_FIELDS = [
    "standardtype",
    "standardtype_standartnumber",
    "standardtype_standartdate",
    "standartnumber",
    "standartstatus",
    "standartstatus_additionalstandartstatus",
]
# Date-related metadata fields.
LGLI_DATE_INFO_FIELDS = [
    "datepublication",
    "dateintroduction",
    "dateactualizationtext",
    "dateregistration",
    "dateactualizationdescr",
    "dateexpiration",
    "datelastedition",
]
# Hardcoded from the `libgenli_elem_descr` table.
# Identifier types that can appear on a Libgen.li record.
# Each entry: "label" (display name), "url" (template with %s placeholder for
# the identifier value; empty when there is no canonical link), "description"
# (longer explanation), and optionally "website" (informational page).
LGLI_IDENTIFIERS = {
    "asin": { "label": "ASIN", "url": "https://www.amazon.com/dp/%s", "description": "Amazon Standard Identification Number"},
    "audibleasin": { "label": "Audible-ASIN", "url": "https://www.audible.com/pd/%s", "description": "Audible ASIN"},
    "bl": { "label": "BL", "url": "http://explore.bl.uk/primo_library/libweb/action/dlDisplay.do?vid=BLVU1&docId=BLL01%s", "description": "The British Library"},
    "bleilerearlyyears": { "label": "Bleiler Early Years", "url": "", "description": "Richard Bleiler, Everett F. Bleiler. Science-Fiction: The Early Years. Kent State University Press, 1991, xxiii+998 p."},
    "bleilergernsback": { "label": "Bleiler Gernsback", "url": "", "description": "Everett F. Bleiler, Richard Bleiler. Science-Fiction: The Gernsback Years. Kent State University Press, 1998, xxxii+730pp"},
    "bleilersupernatural": { "label": "Bleiler Supernatural", "url": "", "description": "Everett F. Bleiler. The Guide to Supernatural Fiction. Kent State University Press, 1983, xii+723 p."},
    "bn": { "label": "BN", "url": "http://www.barnesandnoble.com/s/%s", "description": "Barnes and Noble"},
    "bnb": { "label": "BNB", "url": "http://search.bl.uk/primo_library/libweb/action/search.do?fn=search&vl(freeText0)=%s", "description": "The British National Bibliography"},
    "bnf": { "label": "BNF", "url": "http://catalogue.bnf.fr/ark:/12148/%s", "description": "Bibliotheque nationale de France"},
    "coollibbookid": { "label": "Coollib", "url": "https://coollib.ru/b/%s", "description":""},
    "copac": { "label": "COPAC", "url": "http://copac.jisc.ac.uk/id/%s?style=html", "description": "UK/Irish union catalog"},
    "crossrefbookid": { "label": "Crossref", "url": "https://data.crossref.org/depositorreport?pubid=%s", "description":""},
    "dnb": { "label": "DNB", "url": "http://d-nb.info/%s", "description": "Deutsche Nationalbibliothek"},
    "fantlabeditionid": { "label": "FantLab Edition ID", "url": "https://fantlab.ru/edition%s", "description": "Лаболатория фантастики"},
    "flibustabookid": { "label": "Flibusta", "url": "https://flibusta.is/b/%s", "description":""},
    "goodreads": { "label": "Goodreads", "url": "http://www.goodreads.com/book/show/%s", "description": "Goodreads social cataloging site"},
    "googlebookid": { "label": "Google Books", "url": "https://books.google.com/books?id=%s", "description": ""},
    "isfdbpubideditions": { "label": "ISFDB (editions)", "url": "http://www.isfdb.org/cgi-bin/pl.cgi?%s", "description": ""},
    "issn": { "label": "ISSN", "url": "https://urn.issn.org/urn:issn:%s", "description": "International Standard Serial Number"},
    "jnbjpno": { "label": "JNB/JPNO", "url": "https://iss.ndl.go.jp/api/openurl?ndl_jpno=%s&locale=en", "description": "The Japanese National Bibliography"},
    "jstorstableid": { "label": "JSTOR Stable", "url": "https://www.jstor.org/stable/%s", "description": ""},
    "kbr": { "label": "KBR", "url": "https://opac.kbr.be/Library/doc/SYRACUSE/%s/", "description": "De Belgische Bibliografie/La Bibliographie de Belgique"},
    "lccn": { "label": "LCCN", "url": "http://lccn.loc.gov/%s", "description": "Library of Congress Control Number"},
    "librusecbookid": { "label": "Librusec", "url": "https://lib.rus.ec/b/%s", "description":""},
    "litmirbookid": { "label": "Litmir", "url": "https://www.litmir.me/bd/?b=%s", "description":""},
    "ltf": { "label": "LTF", "url": "http://www.tercerafundacion.net/biblioteca/ver/libro/%s", "description": "La Tercera Fundación"},
    "maximabookid": { "label": "Maxima", "url": "http://maxima-library.org/mob/b/%s", "description":""},
    "ndl": { "label": "NDL", "url": "http://id.ndl.go.jp/bib/%s/eng", "description": "National Diet Library"},
    "nilf": { "label": "NILF", "url": "http://nilf.it/%s/", "description": "Numero Identificativo della Letteratura Fantastica / Fantascienza"},
    "nla": { "label": "NLA", "url": "https://nla.gov.au/nla.cat-vn%s", "description": "National Library of Australia"},
    "noosfere": { "label": "NooSFere", "url": "https://www.noosfere.org/livres/niourf.asp?numlivre=%s", "description": "NooSFere"},
    "oclcworldcat": { "label": "OCLC/WorldCat", "url": "https://www.worldcat.org/oclc/%s", "description": "Online Computer Library Center"},
    "openlibrary": { "label": "Open Library", "url": "https://openlibrary.org/books/%s", "description": ""},
    "pii": { "label": "PII", "url": "", "description": "Publisher Item Identifier", "website": "https://en.wikipedia.org/wiki/Publisher_Item_Identifier"},
    "pmcid": { "label": "PMC ID", "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/%s/", "description": "PubMed Central ID"},
    "pmid": { "label": "PMID", "url": "https://pubmed.ncbi.nlm.nih.gov/%s/", "description": "PubMed ID"},
    "porbase": { "label": "PORBASE", "url": "http://id.bnportugal.gov.pt/bib/porbase/%s", "description": "Biblioteca Nacional de Portugal"},
    "ppn": { "label": "PPN", "url": "http://picarta.pica.nl/xslt/DB=3.9/XMLPRS=Y/PPN?PPN=%s", "description": "De Nederlandse Bibliografie Pica Productie Nummer"},
    "reginald1": { "label": "Reginald-1", "url": "", "description": "R. Reginald. Science Fiction and Fantasy Literature: A Checklist, 1700-1974, with Contemporary Science Fiction Authors II. Gale Research Co., 1979, 1141p."},
    "reginald3": { "label": "Reginald-3", "url": "", "description": "Robert Reginald. Science Fiction and Fantasy Literature, 1975-1991: A Bibliography of Science Fiction, Fantasy, and Horror Fiction Books and Nonfiction Monographs. Gale Research Inc., 1992, 1512 p."},
    "sfbg": { "label": "SFBG", "url": "http://www.sfbg.us/book/%s", "description": "Catalog of books published in Bulgaria"},
    "sfleihbuch": { "label": "SF-Leihbuch", "url": "http://www.sf-leihbuch.de/index.cfm?bid=%s", "description": "Science Fiction-Leihbuch-Datenbank"},
}
# Hardcoded from the `libgenli_elem_descr` table.
# Classification systems that can appear on a Libgen.li record; same entry
# shape as LGLI_IDENTIFIERS ("label", "url" template, "description",
# optional "website").
LGLI_CLASSIFICATIONS = {
    "classification": { "label": "Classification", "url": "", "description": "" },
    "classificationokp": { "label": "OKP", "url": "https://classifikators.ru/okp/%s", "description": "" },
    "classificationgostgroup": { "label": "GOST group", "url": "", "description": "", "website": "https://en.wikipedia.org/wiki/GOST" },
    "classificationoks": { "label": "OKS", "url": "", "description": "" },
    "libraryofcongressclassification": { "label": "LCC", "url": "https://catalog.loc.gov/vwebv/search?searchCode=CALL%2B&searchArg=%s&searchType=1&limitTo=none&fromYear=&toYear=&limitTo=LOCA%3Dall&limitTo=PLAC%3Dall&limitTo=TYPE%3Dall&limitTo=LANG%3Dall&recCount=25", "description": "Library of Congress Classification", "website": "https://en.wikipedia.org/wiki/Library_of_Congress_Classification" },
    "udc": { "label": "UDC", "url": "https://libgen.li/biblioservice.php?value=%s&type=udc", "description": "Universal Decimal Classification", "website": "https://en.wikipedia.org/wiki/Universal_Decimal_Classification" },
    "ddc": { "label": "DDC", "url": "https://libgen.li/biblioservice.php?value=%s&type=ddc", "description": "Dewey Decimal", "website": "https://en.wikipedia.org/wiki/List_of_Dewey_Decimal_classes" },
    "lbc": { "label": "LBC", "url": "https://libgen.li/biblioservice.php?value=%s&type=bbc", "description": "Library-Bibliographical Classification", "website": "https://www.isko.org/cyclo/lbc" },
}
# Renames applied when folding LGLI identifier names into the unified
# identifier namespace (see UNIFIED_IDENTIFIERS below).
LGLI_IDENTIFIERS_MAPPING = {
    "oclcworldcat": "oclc",
    "openlibrary": "ol",
    "googlebookid": "gbook",
}
# Renames applied when folding LGLI classification names into the unified
# classification namespace (see UNIFIED_CLASSIFICATIONS below).
LGLI_CLASSIFICATIONS_MAPPING = {
    "classification": "class",
    "classificationokp": "okp",
    "classificationgostgroup": "gost",
    "classificationoks": "oks",
    "libraryofcongressclassification": "lcc",
}

# Libgen.rs identifier field names -> unified identifier names.
LGRS_TO_UNIFIED_IDENTIFIERS_MAPPING = {
    'asin': 'asin',
    'googlebookid': 'gbook',
    'openlibraryid': 'ol',
    'doi': 'doi',
    'issn': 'issn',
}
# Libgen.rs classification field names -> unified classification names.
LGRS_TO_UNIFIED_CLASSIFICATIONS_MAPPING = {
    'udc': 'udc',
    'ddc': 'ddc',
    'lbc': 'lbc',
    'lcc': 'lcc',
}

# Canonical ("unified") identifier namespace used across all sources.
# Entries share the LGLI_IDENTIFIERS shape; the **-merge at the bottom pulls
# in all LGLI identifiers under their renamed keys. Extended further below
# from ol_edition.json at import time.
UNIFIED_IDENTIFIERS = {
    "isbn10": { "label": "ISBN-10", "url": "https://en.wikipedia.org/wiki/Special:BookSources?isbn=%s", "description": "" },
    "isbn13": { "label": "ISBN-13", "url": "https://en.wikipedia.org/wiki/Special:BookSources?isbn=%s", "description": "" },
    "doi": { "label": "DOI", "url": "https://doi.org/%s", "description": "Digital Object Identifier" },
    "lgrsnf": { "label": "Libgen.rs Non-Fiction", "url": "https://libgen.rs/json.php?fields=*&ids=%s", "description": "" },
    "lgrsfic": { "label": "Libgen.rs Fiction", "url": "https://libgen.rs/fiction/", "description": "" },
    "lgli": { "label": "Libgen.li File", "url": "https://libgen.li/file.php?id=%s", "description": "" },
    "zlib": { "label": "Z-Library", "url": "https://1lib.sk", "description": "" },
    # TODO: Add URL/description for these.
    "csbn": { "label": "CSBN", "url": "", "description": "" },
    "ean13": { "label": "EAN-13", "url": "", "description": "" },
    "duxiu_ssid": { "label": "DuXiu SSID", "url": "", "description": "" },
    "duxiu_dxid": { "label": "DuXiu DXID", "url": "", "description": "" },
    "cadal_ssno": { "label": "CADAL SSNO", "url": "", "description": "" },
    **{LGLI_IDENTIFIERS_MAPPING.get(key, key): value for key, value in LGLI_IDENTIFIERS.items()},
    # Plus more added below!
}
# Canonical ("unified") classification namespace; seeded from LGLI
# classifications under their renamed keys, extended further below.
UNIFIED_CLASSIFICATIONS = {
    **{LGLI_CLASSIFICATIONS_MAPPING.get(key, key): value for key, value in LGLI_CLASSIFICATIONS.items()},
    # Plus more added below!
}

# Open Library identifier field names -> unified identifier names.
# The identity entries from UNIFIED_IDENTIFIERS at the bottom make every
# unified name map to itself; more entries are appended at import time below.
OPENLIB_TO_UNIFIED_IDENTIFIERS_MAPPING = {
    'amazon': 'asin',
    'amazon.co.uk_asin': 'asin',
    'amazon.ca_asin': 'asin',
    'amazon.de_asin': 'asin',
    'amazon.it_asin': 'asin',
    'amazon.co.jp_asin': 'asin',
    'british_library': 'bl',
    'british_national_bibliography': 'bnb',
    'google': 'gbook',
    'isbn_10': 'isbn10',
    'isbn_13': 'isbn13',
    'national_diet_library,_japan': 'ndl',
    'oclc_numbers': 'oclc',
    'isfdb': 'isfdbpubideditions',
    'lccn_permalink': 'lccn',
    'library_of_congress': 'lccn',
    'library_of_congress_catalogue_number': 'lccn',
    'library_of_congress_catalog_no.': 'lccn',
    'abebooks,de': 'abebooks.de',
    'bibliothèque_nationale_de_france_(bnf)': 'bibliothèque_nationale_de_france',
    'harvard_university_library': 'harvard',
    'gallica_(bnf)': 'bibliothèque_nationale_de_france',
    'depósito_legal_n.a.': 'depósito_legal',
    **{key: key for key in UNIFIED_IDENTIFIERS.keys()},
    # Plus more added below!
}
# Open Library classification field names -> unified classification names.
OPENLIB_TO_UNIFIED_CLASSIFICATIONS_MAPPING = {
    'dewey_decimal_class': 'ddc',
    'dewey_number': 'ddc',
    'lc_classifications': 'lcc',
    'library_bibliographical_classification': 'lbc',
    'udc': 'udc',
    'library_of_congress_classification_(lcc)': 'lcc',
    'dewey_decimal_classification_(ddc)': 'ddc',
    **{key: key for key in UNIFIED_CLASSIFICATIONS.keys()},
    # Plus more added below!
}
# Hardcoded labels for OL. The "label" fields in ol_edition.json become "description" instead.
# Display labels for Open Library identifier/classification names that are
# not otherwise covered by the unified namespaces. Keys are OL field names;
# values are short human-readable labels.
OPENLIB_LABELS = {
    "abaa": "ABAA",
    "abebooks.de": "Abebooks",
    "abwa_bibliographic_number": "ABWA",
    "alibris_id": "Alibris",
    "bayerische_staatsbibliothek": "BSB-ID",
    "bcid": "BCID",
    "better_world_books": "BWB",
    "bhl": "BHL",
    "bibliothèque_nationale_de_france": "BnF",
    "bibsys": "Bibsys",
    "bodleian,_oxford_university": "Bodleian",
    "booklocker.com": "BookLocker",
    "bookmooch": "Book Mooch",
    "booksforyou": "Books For You",
    "bookwire": "BookWire",
    "boston_public_library": "BPL",
    "canadian_national_library_archive": "CNLA",
    "choosebooks": "Choosebooks",
    "cornell_university_library": "Cornell",
    "cornell_university_online_library": "Cornell",
    "dc_books": "DC",
    "depósito_legal": "Depósito Legal",
    "digital_library_pomerania": "Pomerania",
    "discovereads": "Discovereads",
    "dnb": "DNB",
    "dominican_institute_for_oriental_studies_library": "Al Kindi",
    "etsc": "ETSC",
    "fennica": "Fennica",
    "finnish_public_libraries_classification_system": "FPL",
    "folio": "Folio",
    "freebase": "Freebase",
    "goethe_university_library,_frankfurt": "Goethe",
    "goodreads": "Goodreads",
    "grand_comics_database": "Grand Comics DB",
    "harvard": "Harvard",
    "hathi_trust": "Hathi",
    "identificativo_sbn": "SBN",
    "ilmiolibro": "Ilmiolibro",
    "inducks": "INDUCKS",
    "issn": "ISSN",
    "istc": "ISTC",
    "lccn": "LCCN",
    "learnawesome": "LearnAwesome",
    "library_and_archives_canada_cataloguing_in_publication": "CIP",
    "librarything": "Library Thing",
    "libris": "Libris",
    "librivox": "LibriVox",
    "lulu": "Lulu",
    "magcloud": "Magcloud",
    "nbuv": "NBUV",
    "nla": "NLA",
    "nur": "NUR",
    "ocaid": "Internet Archive",
    "openstax": "OpenStax",
    "overdrive": "OverDrive",
    "paperback_swap": "Paperback Swap",
    "project_gutenberg": "Gutenberg",
    "publishamerica": "PublishAmerica",
    "rvk": "RVK",
    "scribd": "Scribd",
    "shelfari": "Shelfari",
    "siso": "SISO",
    "smashwords_book_download": "Smashwords",
    "standard_ebooks": "Standard Ebooks",
    "storygraph": "Storygraph",
    "ulrls": "ULRLS",
    "ulrls_classmark": "ULRLS Classmark",
    "w._w._norton": "W.W.Norton",
    "wikidata": "Wikidata",
    "wikisource": "Wikisource",
    "yakaboo": "Yakaboo",
    "zdb-id": "ZDB-ID",
}
# Retrieved from https://openlibrary.org/config/edition.json on 2023-07-02
ol_edition_json = orjson.loads(open(os.path.dirname(os.path.realpath(__file__)) + '/page/ol_edition.json').read())
# Import-time normalization: fold Open Library's declared identifiers into
# the unified namespace. For each OL identifier, either map it to an existing
# unified name (which must already be in UNIFIED_IDENTIFIERS), or register it
# as a new unified name — in which case it *must* have a label in
# OPENLIB_LABELS (KeyError here means a new OL field needs a label added).
for identifier in ol_edition_json['identifiers']:
    if 'url' in identifier:
        # OL templates use '@@@' as placeholder; unified entries use '%s'.
        identifier['url'] = identifier['url'].replace('@@@', '%s')
    unified_name = identifier['name']
    if unified_name in OPENLIB_TO_UNIFIED_IDENTIFIERS_MAPPING:
        unified_name = OPENLIB_TO_UNIFIED_IDENTIFIERS_MAPPING[unified_name]
        if unified_name not in UNIFIED_IDENTIFIERS:
            raise Exception(f"unified_name '{unified_name}' should be in UNIFIED_IDENTIFIERS")
    else:
        OPENLIB_TO_UNIFIED_IDENTIFIERS_MAPPING[unified_name] = unified_name
    if unified_name not in UNIFIED_IDENTIFIERS:
        # If unified name is not in OPENLIB_TO_UNIFIED_*_MAPPING, then it *has* to be in OPENLIB_LABELS.
        label = OPENLIB_LABELS[unified_name]
        description = ''
        # OL's "label" field becomes our "description" (unless redundant).
        if identifier.get('description', '') != label:
            description = identifier.get('description', '')
        UNIFIED_IDENTIFIERS[unified_name] = { **identifier, 'label': label, 'description': description }
# Same normalization for Open Library's declared classifications.
for classification in ol_edition_json['classifications']:
    if 'website' in classification:
        classification['website'] = classification['website'].split(' ')[0] # Sometimes there's a suffix in text..
    unified_name = classification['name']
    if unified_name in OPENLIB_TO_UNIFIED_CLASSIFICATIONS_MAPPING:
        unified_name = OPENLIB_TO_UNIFIED_CLASSIFICATIONS_MAPPING[unified_name]
        if unified_name not in UNIFIED_CLASSIFICATIONS:
            raise Exception(f"unified_name '{unified_name}' should be in UNIFIED_CLASSIFICATIONS")
    else:
        OPENLIB_TO_UNIFIED_CLASSIFICATIONS_MAPPING[unified_name] = unified_name
    if unified_name not in UNIFIED_CLASSIFICATIONS:
        # If unified name is not in OPENLIB_TO_UNIFIED_*_MAPPING, then it *has* to be in OPENLIB_LABELS.
        label = OPENLIB_LABELS[unified_name]
        description = ''
        if classification.get('description', '') != label:
            description = classification.get('description', '')
        UNIFIED_CLASSIFICATIONS[unified_name] = { **classification, 'label': label, 'description': description }

def init_identifiers_and_classification_unified(output_dict):
    # Ensure the two unified collection keys exist on an aarecord-style dict.
    if 'identifiers_unified' not in output_dict:
        output_dict['identifiers_unified'] = {}
    if 'classifications_unified' not in output_dict:
        output_dict['classifications_unified'] = {}

def add_identifier_unified(output_dict, name, value):
    # Record `value` under the unified identifier name for `name` in
    # output_dict['identifiers_unified'], deduplicating values.
    # Unknown identifier names are logged and skipped (not raised).
    if value is None:
        print(f"Warning: 'None' found for add_identifier_unified {name}")
        return
    name = name.strip()
    value = str(value).strip()
    if name == 'lccn' and 'http://lccn.loc.gov/' in value:
        value = value.replace('http://lccn.loc.gov/', '') # for lccn_permalink
        value = value.split('/')[0]
    if len(value) == 0:
        return
    unified_name = OPENLIB_TO_UNIFIED_IDENTIFIERS_MAPPING.get(name, name)
    if unified_name in UNIFIED_IDENTIFIERS:
        if unified_name not in output_dict['identifiers_unified']:
            output_dict['identifiers_unified'][unified_name] = []
        if value not in output_dict['identifiers_unified'][unified_name]:
            output_dict['identifiers_unified'][unified_name].append(value)
    else:
        print(f"Warning: Unknown identifier in add_identifier_unified: {name}")
def add_classification_unified(output_dict, name, value):
    # Record `value` under the unified classification name for `name` in
    # output_dict['classifications_unified'], deduplicating values.
    # Unknown classification names are logged and skipped (not raised).
    if value is None:
        print(f"Warning: 'None' found for add_classification_unified {name}")
        return
    name = name.strip()
    value = str(value).strip()
    if len(value) == 0:
        return
    unified_name = OPENLIB_TO_UNIFIED_CLASSIFICATIONS_MAPPING.get(name, name)
    if unified_name in UNIFIED_CLASSIFICATIONS:
        if unified_name not in output_dict['classifications_unified']:
            output_dict['classifications_unified'][unified_name] = []
        if value not in output_dict['classifications_unified'][unified_name]:
            output_dict['classifications_unified'][unified_name].append(value)
    else:
        print(f"Warning: Unknown classification in add_classification_unified: {name}")

def normalize_isbn(string):
    """Return the canonical ISBN-13 for `string`, or '' if it is not a valid ISBN."""
    canonical_isbn13 = isbnlib.get_canonical_isbn(string, output='isbn13')
    try:
        # get_canonical_isbn can return None; any of these calls may then
        # raise, which we treat as "not an ISBN".
        if (not isbnlib.is_isbn10(isbnlib.to_isbn10(canonical_isbn13))) or len(canonical_isbn13) != 13 or len(isbnlib.info(canonical_isbn13)) == 0:
            return ''
    except Exception:  # narrowed from bare `except:` so Ctrl-C still propagates
        return ''
    return canonical_isbn13

def add_isbns_unified(output_dict, potential_isbns):
    """Normalize `potential_isbns` and add them to output_dict as isbn10/isbn13/csbn identifiers."""
    isbn10s = set()
    isbn13s = set()
    csbns = set()
    for potential_isbn in potential_isbns:
        if '·' in potential_isbn:
            # Chinese CSBNs contain a '·' separator; keep them verbatim.
            csbns.add(potential_isbn)
        else:
            isbn13 = normalize_isbn(potential_isbn)
            if isbn13 != '':
                isbn13s.add(isbn13)
                # Also derive the ISBN-10 form when one exists (979- prefixed
                # ISBN-13s have no ISBN-10 equivalent).
                isbn10 = isbnlib.to_isbn10(isbn13)
                if isbnlib.is_isbn10(isbn10 or ''):
                    isbn10s.add(isbn10)
    for isbn10 in isbn10s:
        add_identifier_unified(output_dict, 'isbn10', isbn10)
    for isbn13 in isbn13s:
        add_identifier_unified(output_dict, 'isbn13', isbn13)
    for csbn in csbns:
        add_identifier_unified(output_dict, 'csbn', csbn)

def merge_unified_fields(list_of_fields_unified):
    """Merge several {unified_name: [values]} dicts, deduplicating values per name."""
    merged_sets = {}
    for fields_unified in list_of_fields_unified:
        for unified_name, values in fields_unified.items():
            if unified_name not in merged_sets:
                merged_sets[unified_name] = set()
            for value in values:
                merged_sets[unified_name].add(value)
    return { unified_name: list(merged_set) for unified_name, merged_set in merged_sets.items() }

# Short URL-facing search index names -> full Elasticsearch index names.
SEARCH_INDEX_SHORT_LONG_MAPPING = {
    '': 'aarecords',
    'journals': 'aarecords_journals',
    'digital_lending': 'aarecords_digital_lending',
    'meta': 'aarecords_metadata',
}
def get_aarecord_id_prefix_is_metadata(id_prefix):
    """True if `id_prefix` denotes a metadata-only record (no downloadable file)."""
    return (id_prefix in ['isbn', 'ol', 'oclc', 'duxiu_ssid', 'cadal_ssno'])
def get_aarecord_search_indexes_for_id_prefix(id_prefix):
    """All search indexes a record with this id prefix may live in.

    Raises:
        Exception: if `id_prefix` is not a known prefix.
    """
    if get_aarecord_id_prefix_is_metadata(id_prefix):
        return ['aarecords_metadata']
    elif id_prefix == 'ia':
        return ['aarecords_digital_lending']
    elif id_prefix in ['md5', 'doi']:
        return ['aarecords', 'aarecords_journals']
    else:
        # Bugfix: was `{aarecord_id}`, an undefined name here, which turned
        # this intended error message into a NameError.
        raise Exception(f"Unknown aarecord_id prefix: {id_prefix}")
def get_aarecord_search_index(id_prefix, content_type):
    """The single search index for a record, given its id prefix and content type.

    Raises:
        Exception: if `id_prefix` is not a known prefix.
    """
    if get_aarecord_id_prefix_is_metadata(id_prefix):
        return 'aarecords_metadata'
    elif id_prefix == 'ia':
        return 'aarecords_digital_lending'
    elif id_prefix in ['md5', 'doi']:
        if content_type == 'journal_article':
            return 'aarecords_journals'
        else:
            return 'aarecords'
    else:
        # Bugfix: was `{aarecord_id}`, an undefined name here (NameError).
        raise Exception(f"Unknown aarecord_id prefix: {id_prefix}")
# Which Elasticsearch cluster hosts each search index.
SEARCH_INDEX_TO_ES_MAPPING = {
    'aarecords': es,
    'aarecords_journals': es,
    'aarecords_digital_lending': es_aux,
    'aarecords_metadata': es_aux,
}
# TODO: Look into https://discuss.elastic.co/t/score-and-relevance-across-the-shards/5371
ES_VIRTUAL_SHARDS_NUM = 12
def virtshard_for_hashed_aarecord_id(hashed_aarecord_id):
    """Map a hashed aarecord id (bytes) to a virtual shard number [0, ES_VIRTUAL_SHARDS_NUM)."""
    return int.from_bytes(hashed_aarecord_id, byteorder='big', signed=False) % ES_VIRTUAL_SHARDS_NUM
def virtshard_for_aarecord_id(aarecord_id):
    """Map an aarecord id (str) to its virtual shard number via MD5."""
    return virtshard_for_hashed_aarecord_id(hashlib.md5(aarecord_id.encode()).digest())
def all_virtshards_for_index(index_name):
    """All virtual-shard index names ('<index>__0' .. '<index>__11') for `index_name`."""
    return [f'{index_name}__{virtshard}' for virtshard in range(0, ES_VIRTUAL_SHARDS_NUM)]

# TODO: translate?
def marc_country_code_to_english(marc_country_code):
    """Translate a MARC country code to its English name; unknown codes pass through unchanged."""
    marc_country_code = marc_country_code.strip()
    return MARC_COUNTRY_CODES.get(marc_country_code) or MARC_DEPRECATED_COUNTRY_CODES.get(marc_country_code) or marc_country_code

# From https://www.loc.gov/marc/countries/countries_code.html
MARC_COUNTRY_CODES = {
    "aa" : "Albania",
    "abc" : "Alberta",
    "aca" : "Australian Capital Territory",
    "ae" : "Algeria",
    "af" : "Afghanistan",
    "ag" : "Argentina",
    "ai" : "Armenia (Republic)",
    "aj" : "Azerbaijan",
    "aku" : "Alaska",
    "alu" : "Alabama",
    "am" : "Anguilla",
    "an" : "Andorra",
    "ao" : "Angola",
    "aq" : "Antigua and Barbuda",
    "aru" : "Arkansas",
    "as" : "American Samoa",
    "at" : "Australia",
    "au" : "Austria",
    "aw" : "Aruba",
    "ay" : "Antarctica",
    "azu" : "Arizona",
    "ba" : "Bahrain",
    "bb" : "Barbados",
    "bcc" : "British Columbia",
    "bd" : "Burundi",
    "be" : "Belgium",
    "bf" : "Bahamas",
    "bg" : "Bangladesh",
    "bh" : "Belize",
    "bi" : "British Indian Ocean Territory",
    "bl" : "Brazil",
    "bm" : "Bermuda Islands",
    "bn" : "Bosnia and Herzegovina",
    "bo" : "Bolivia",
    "bp" : "Solomon Islands",
    "br" : "Burma",
    "bs" : "Botswana",
    "bt" : "Bhutan",
    "bu" : "Bulgaria",
    "bv" : "Bouvet Island",
    "bw" : "Belarus",
    "bx" : "Brunei",
    "ca" : "Caribbean Netherlands",
    "cau" : "California",
    "cb" : "Cambodia",
    "cc" : "China",
    "cd" : "Chad",
    "ce" : "Sri Lanka",
    "cf" : "Congo (Brazzaville)",
    "cg" : "Congo (Democratic Republic)",
    "ch" : "China (Republic : 1949- )",
    "ci" : "Croatia",
    "cj" : "Cayman Islands",
    "ck" : "Colombia",
    "cl" : "Chile",
    "cm" : "Cameroon",
    "co" : "Curaçao",
    "cou" : "Colorado",
    "cq" : "Comoros",
    "cr" : "Costa Rica",
    "ctu" : "Connecticut",
    "cu" : "Cuba",
    "cv" : "Cabo Verde",
    "cw" : "Cook Islands",
    "cx" : "Central African Republic",
    "cy" : "Cyprus",
    "dcu" : "District of Columbia",
    "deu" : "Delaware",
    "dk" : "Denmark",
    "dm" : "Benin",
    "dq" : "Dominica",
    "dr" : "Dominican Republic",
    "ea" : "Eritrea",
    "ec" : "Ecuador",
    "eg" : "Equatorial Guinea",
    "em" : "Timor-Leste",
    "enk" : "England",
    "er" : "Estonia",
    "es" : "El Salvador",
    "et" : "Ethiopia",
    "fa" : "Faroe Islands",
    "fg" : "French Guiana",
    "fi" : "Finland",
    "fj" : "Fiji",
    "fk" : "Falkland Islands",
    "flu" : "Florida",
    "fm" : "Micronesia (Federated States)",
    "fp" : "French Polynesia",
    "fr" : "France",
    "fs" : "Terres australes et antarctiques françaises",
    "ft" : "Djibouti",
    "gau" : "Georgia",
    "gb" : "Kiribati",
    "gd" : "Grenada",
    "gg" : "Guernsey",
    "gh" : "Ghana",
    "gi" : "Gibraltar",
    "gl" : "Greenland",
    "gm" : "Gambia",
    "go" : "Gabon",
    "gp" : "Guadeloupe",
    "gr" : "Greece",
    "gs" : "Georgia (Republic)",
    "gt" : "Guatemala",
    "gu" : "Guam",
    "gv" : "Guinea",
    "gw" : "Germany",
    "gy" : "Guyana",
    "gz" : "Gaza Strip",
    "hiu" : "Hawaii",
    "hm" : "Heard and McDonald Islands",
    "ho" : "Honduras",
    "ht" : "Haiti",
    "hu" : "Hungary",
    "iau" : "Iowa",
    "ic" : "Iceland",
    "idu" : "Idaho",
    "ie" : "Ireland",
    "ii" : "India",
    "ilu" : "Illinois",
    "im" : "Isle of Man",
    "inu" : "Indiana",
    "io" : "Indonesia",
    "iq" : "Iraq",
    "ir" : "Iran",
    "is" : "Israel",
    "it" : "Italy",
    "iv" : "Côte d'Ivoire",
    "iy" : "Iraq-Saudi Arabia Neutral Zone",
    "ja" : "Japan",
    "je" : "Jersey",
    "ji" : "Johnston Atoll",
    "jm" : "Jamaica",
    "jo" : "Jordan",
    "ke" : "Kenya",
    "kg" : "Kyrgyzstan",
    "kn" : "Korea (North)",
    "ko" : "Korea (South)",
    "ksu" : "Kansas",
    "ku" : "Kuwait",
    "kv" : "Kosovo",
    "kyu" : "Kentucky",
    "kz" : "Kazakhstan",
    "lau" : "Louisiana",
    "lb" : "Liberia",
    "le" : "Lebanon",
    "lh" : "Liechtenstein",
    "li" : "Lithuania",
    "lo" : "Lesotho",
    "ls" : "Laos",
    "lu" : "Luxembourg",
    "lv" : "Latvia",
    "ly" : "Libya",
    "mau" : "Massachusetts",
    "mbc" : "Manitoba",
    "mc" : "Monaco",
    "mdu" : "Maryland",
    "meu" : "Maine",
    "mf" : "Mauritius",
    "mg" : "Madagascar",
    "miu" : "Michigan",
    "mj" : "Montserrat",
    "mk" : "Oman",
    "ml" : "Mali",
    "mm" : "Malta",
    "mnu" : "Minnesota",
    "mo" : "Montenegro",
    "mou" : "Missouri",
    "mp" : "Mongolia",
    "mq" : "Martinique",
    "mr" : "Morocco",
    "msu" : "Mississippi",
    "mtu" : "Montana",
    "mu" : "Mauritania",
    "mv" : "Moldova",
    "mw" : "Malawi",
    "mx" : "Mexico",
    "my" : "Malaysia",
    "mz" : "Mozambique",
    "nbu" : "Nebraska",
    "ncu" : "North Carolina",
    "ndu" : "North Dakota",
    "ne" : "Netherlands",
    "nfc" : "Newfoundland and Labrador",
    "ng" : "Niger",
    "nhu" : "New Hampshire",
    "nik" : "Northern Ireland",
    "nju" : "New Jersey",
    "nkc" : "New Brunswick",
    "nl" : "New Caledonia",
    "nmu" : "New Mexico",
    "nn" : "Vanuatu",
    "no" : "Norway",
    "np" : "Nepal",
    "nq" : "Nicaragua",
    "nr" : "Nigeria",
    "nsc" : "Nova Scotia",
    "ntc" : "Northwest Territories",
    "nu" : "Nauru",
    "nuc" : "Nunavut",
    "nvu" : "Nevada",
    "nw" : "Northern Mariana Islands",
    "nx" : "Norfolk Island",
    "nyu" : "New York (State)",
    "nz" : "New Zealand",
    "ohu" : "Ohio",
    "oku" : "Oklahoma",
    "onc" : "Ontario",
    "oru" : "Oregon",
    "ot" : "Mayotte",
    "pau" : "Pennsylvania",
    "pc" : "Pitcairn Island",
    "pe" : "Peru",
    "pf" : "Paracel Islands",
    "pg" : "Guinea-Bissau",
    "ph" : "Philippines",
    "pic" : "Prince Edward Island",
    "pk" : "Pakistan",
    "pl" : "Poland",
    "pn" : "Panama",
    "po" : "Portugal",
    "pp" : "Papua New Guinea",
    "pr" : "Puerto Rico",
    "pw" : "Palau",
    "py" : "Paraguay",
    "qa" : "Qatar",
    "qea" : "Queensland",
    "quc" : "Québec (Province)",
    "rb" : "Serbia",
    "re" : "Réunion",
    "rh" : "Zimbabwe",
    "riu" : "Rhode Island",
    "rm" : "Romania",
    "ru" : "Russia (Federation)",
    "rw" : "Rwanda",
    "sa" : "South Africa",
    "sc" : "Saint-Barthélemy",
    "scu" : "South Carolina",
    "sd" : "South Sudan",
    "sdu" : "South Dakota",
    "se" : "Seychelles",
    "sf" : "Sao Tome and Principe",
    "sg" : "Senegal",
    "sh" : "Spanish North Africa",
    "si" : "Singapore",
    "sj" : "Sudan",
    "sl" : "Sierra Leone",
    "sm" : "San Marino",
    "sn" : "Sint Maarten",
    "snc" : "Saskatchewan",
    "so" : "Somalia",
    "sp" : "Spain",
    "sq" : "Eswatini",
    "sr" : "Surinam",
    "ss" : "Western Sahara",
    "st" : "Saint-Martin",
    "stk" : "Scotland",
    "su" : "Saudi Arabia",
    "sw" : "Sweden",
    "sx" : "Namibia",
    "sy" : "Syria",
    "sz" : "Switzerland",
    "ta" : "Tajikistan",
    "tc" : "Turks and Caicos Islands",
    "tg" : "Togo",
    "th" : "Thailand",
    "ti" : "Tunisia",
    "tk" : "Turkmenistan",
    "tl" : "Tokelau",
    "tma" : "Tasmania",
    "tnu" : "Tennessee",
    "to" : "Tonga",
    "tr" : "Trinidad and Tobago",
    "ts" : "United Arab Emirates",
    "tu" : "Turkey",
    "tv" : "Tuvalu",
    "txu" : "Texas",
    "tz" : "Tanzania",
    "ua" : "Egypt",
    "uc" : "United States Misc. Caribbean Islands",
    "ug" : "Uganda",
    "un" : "Ukraine",
    "up" : "United States Misc. Pacific Islands",
    "utu" : "Utah",
    "uv" : "Burkina Faso",
    "uy" : "Uruguay",
    "uz" : "Uzbekistan",
    "vau" : "Virginia",
    "vb" : "British Virgin Islands",
    "vc" : "Vatican City",
    "ve" : "Venezuela",
    "vi" : "Virgin Islands of the United States",
    "vm" : "Vietnam",
    "vp" : "Various places",
    "vra" : "Victoria",
    "vtu" : "Vermont",
    "wau" : "Washington (State)",
    "wea" : "Western Australia",
    "wf" : "Wallis and Futuna",
    "wiu" : "Wisconsin",
    "wj" : "West Bank of the Jordan River",
    "wk" : "Wake Island",
    "wlk" : "Wales",
    "ws" : "Samoa",
    "wvu" : "West Virginia",
    "wyu" : "Wyoming",
    "xa" : "Christmas Island (Indian Ocean)",
    "xb" : "Cocos (Keeling) Islands",
    "xc" : "Maldives",
    "xd" : "Saint Kitts-Nevis",
    "xe" : "Marshall Islands",
    "xf" : "Midway Islands",
    "xga" : "Coral Sea Islands Territory",
    "xh" : "Niue",
    "xj" : "Saint Helena",
    "xk" : "Saint Lucia",
    "xl" : "Saint Pierre and Miquelon",
    "xm" : "Saint Vincent and the Grenadines",
    "xn" : "North Macedonia",
    "xna" : "New South Wales",
    "xo" : "Slovakia",
    "xoa" : "Northern Territory",
    "xp" : "Spratly Island",
    "xr" : "Czech Republic",
    "xra" : "South Australia",
    "xs" : "South Georgia and the South Sandwich Islands",
    "xv" : "Slovenia",
    "xx" : "No place, unknown, or undetermined",
    "xxc" : "Canada",
    "xxk" : "United Kingdom",
    "xxu" : "United States",
    "ye" : "Yemen",
    "ykc" : "Yukon Territory",
    "za" : "Zambia",
}
# Deprecated MARC country codes (also from the LoC list above); consulted as
# a fallback by marc_country_code_to_english.
MARC_DEPRECATED_COUNTRY_CODES = {
    "ac" : "Ashmore and Cartier Islands",
    "ai" : "Anguilla",
    "air" : "Armenian S.S.R.",
    "ajr" : "Azerbaijan S.S.R.",
    "bwr" : "Byelorussian S.S.R.",
    "cn" : "Canada",
    "cp" : "Canton and Enderbury Islands",
    "cs" : "Czechoslovakia",
    "cz" : "Canal Zone",
    "err" : "Estonia",
    "ge" : "Germany (East)",
    "gn" : "Gilbert and Ellice Islands",
    "gsr" : "Georgian S.S.R.",
    "hk" : "Hong Kong",
    "iu" : "Israel-Syria Demilitarized Zones",
    "iw" : "Israel-Jordan Demilitarized Zones",
    "jn" : "Jan Mayen",
    "kgr" : "Kirghiz S.S.R.",
    "kzr" : "Kazakh S.S.R.",
    "lir" : "Lithuania",
    "ln" : "Central and Southern Line Islands",
    "lvr" : "Latvia",
    "mh" : "Macao",
    "mvr" : "Moldavian S.S.R.",
    "na" : "Netherlands Antilles",
    "nm" : "Northern Mariana Islands",
    "pt" : "Portuguese Timor",
    "rur" : "Russian S.F.S.R.",
    "ry" : "Ryukyu Islands, Southern",
    "sb" : "Svalbard",
    "sk" : "Sikkim",
    "sv" : "Swan Islands",
    "tar" : "Tajik S.S.R.",
    "tkr" : "Turkmen S.S.R.",
    "tt" : "Trust Territory of the Pacific Islands",
    "ui" : "United Kingdom Misc. Islands",
    "uik" : "United Kingdom Misc. Islands",
    "uk" : "United Kingdom",
    "unr" : "Ukraine",
    "ur" : "Soviet Union",
    "us" : "United States",
    "uzr" : "Uzbek S.S.R.",
    "vn" : "Vietnam, North",
    "vs" : "Vietnam, South",
    "wb" : "West Berlin",
    "xi" : "Saint Kitts-Nevis-Anguilla",
    "xxr" : "Soviet Union",
    "ys" : "Yemen (People's Democratic Republic)",
    "yu" : "Serbia and Montenegro",
}


# Per-thread handle to the seekable zstd WorldCat dump (IndexedZstdFile is
# not shared across threads), plus a process-wide cache of pre-parsed lines.
worldcat_thread_local = threading.local()
worldcat_line_cache = {}

def set_worldcat_line_cache(parsed_lines):
    """Replace the WorldCat line cache with `parsed_lines` ((oclc_id, lines) pairs)."""
    global worldcat_line_cache
    worldcat_line_cache.clear()
    for oclc_id, lines in parsed_lines:
        worldcat_line_cache[oclc_id] = lines

def get_worldcat_pos_before_id(oclc_id):
    """Binary-search the WorldCat dump for a byte offset at or before the first
    line whose OCLC id is >= `oclc_id`. Opens the file lazily, once per thread."""
    oclc_id = int(oclc_id)

    file = getattr(worldcat_thread_local, 'file', None)
    if file is None:
        file = worldcat_thread_local.file = indexed_zstd.IndexedZstdFile('/worldcat/annas_archive_meta__aacid__worldcat__20231001T025039Z--20231001T235839Z.jsonl.seekable.zst')

    low = 0
    high = file.size()
    mid = 0
    last_mid = -1

    while low < high:
        mid = (low+high) // 2
        file.seek(mid)
        line = file.readline()
        # Seeking lands mid-line; skip to the start of the next full record.
        if not line.startswith(b'{"aacid":"aacid__worldcat__'):
            mid = file.tell()
            line = file.readline()

        # No progress means low/high are within one record; settle on `low`.
        if mid == last_mid:
            mid = low
            high = low
            file.seek(mid)
            line = file.readline()
        last_mid = mid

        if line == b'':
            current_id = 999999999999  # EOF sentinel: larger than any real id
        else:
            current_id = int(line[len(b'{"aacid":"aacid__worldcat__20231001T025039Z__'):].split(b'__', 1)[0])
        if current_id >= oclc_id:
            high = mid
        else:
            low = mid

    return mid

def get_worldcat_records(oclc_id):
    """Return all parsed WorldCat records for `oclc_id`, from cache or by
    scanning the dump from the position found by get_worldcat_pos_before_id."""
    global worldcat_line_cache
    oclc_id = int(oclc_id)

    if oclc_id in worldcat_line_cache:
        return [orjson.loads(line) for line in worldcat_line_cache[oclc_id]]
    # else:
    #     print(f"Cache miss: {oclc_id}")

    pos = get_worldcat_pos_before_id(oclc_id)
    file = worldcat_thread_local.file
    file.seek(pos)
    lines = []
    while True:
        line = file.readline()
        if line == b'':
            current_id = 999999999999  # EOF sentinel: larger than any real id
        else:
            current_id = int(line[len(b'{"aacid":"aacid__worldcat__20231001T025039Z__'):].split(b'__', 1)[0])
        if current_id < oclc_id:
            pass  # not there yet; keep scanning forward
        elif current_id == oclc_id:
            lines.append(line)
        else:
            return [orjson.loads(line) for line in lines]

def aa_currently_seeding(metadata):
    """True if metadata['seeding_at'] (ISO timestamp with tz) is within the last 7 days."""
    return ((datetime.datetime.now(datetime.timezone.utc) - datetime.datetime.strptime(metadata['seeding_at'], "%Y-%m-%dT%H:%M:%S%z")) < datetime.timedelta(days=7)) if ('seeding_at' in metadata) else False

@functools.cache
def get_torrents_json_aa_currently_seeding_by_torrent_path():
    """Load the torrents_json row and map torrent path -> aa_currently_seeding flag.

    Cached for the process lifetime (functools.cache).
    """
    with engine.connect() as connection:
        connection.connection.ping(reconnect=True)
        cursor = connection.connection.cursor(pymysql.cursors.DictCursor)
        cursor.execute('SELECT json FROM torrents_json LIMIT 1')
        return { row['url'].split('dyn/small_file/torrents/', 1)[1]: row['aa_currently_seeding'] for row in orjson.loads(cursor.fetchone()['json']) }