error-page-detector.js
/**
 * Error Page Detector
 * Detects false-positive 200 responses that are actually error pages
 * (e.g., HTTP 403 errors rendered as HTML with 200 status code), and
 * under-construction / coming-soon / parked pages.
 */

import Logger from './logger.js';

const logger = new Logger('ErrorPageDetector');

/**
 * Pages with at least this many visible words are considered real content and
 * are never flagged by detectErrorPage(), even if they mention an error phrase.
 */
const SPARSE_PAGE_WORD_THRESHOLD = 200;

/**
 * How many characters before a phrase are inspected for an opening parenthesis
 * when deciding whether the phrase is a parenthetical label.
 */
const PARENTHESIS_LOOKBEHIND_CHARS = 5;

/**
 * Error indicators to look for in visible text.
 * Matching is case-insensitive and substring-based; the first matching entry
 * (in array order) is the one reported, so more specific entries come first.
 */
const ERROR_INDICATORS = [
  'HTTP ERROR',
  'HTTP Error',
  'Permission Denied',
  'Permission denied',
  'Access Denied',
  'Access denied',
  "You don't have authorisation to view this page",
  "You don't have authorization to view this page",
  'You do not have permission',
  '403 Forbidden',
  'Forbidden',
  '401 Unauthorized',
  'Unauthorized',
  '404 Not Found',
  '500 Internal Server Error',
  'Internal Server Error',
  'Service Unavailable',
  '503 Service Unavailable',
  'This page is not available',
  'Page not available',
  'Access to this resource is denied',
  'You are not authorized',
  'Authentication required',
  'Login required',
];

/**
 * Extract an approximation of the visible text from an HTML string.
 *
 * Removes <script>, <style>, <noscript> and <iframe> blocks, HTML comments and
 * all remaining tags, then collapses whitespace. This is regex-based, not a
 * DOM parse: elements hidden via CSS or the `hidden` attribute are NOT removed.
 *
 * @param {string} html - HTML content
 * @returns {string} Visible text content ('' if html is not a string or
 *   extraction fails; the failure is logged)
 */
function extractVisibleText(html) {
  try {
    return html
      // Non-visible blocks go first so their contents never reach the text.
      .replace(/<script[\s\S]*?<\/script>/gi, '')
      .replace(/<style[\s\S]*?<\/style>/gi, '')
      .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
      .replace(/<iframe[\s\S]*?<\/iframe>/gi, '')
      // Comments may contain ">" which the generic tag pattern below would
      // stop at, leaking the rest of the comment into the visible text.
      .replace(/<!--[\s\S]*?-->/g, ' ')
      .replace(/<[^>]+>/g, ' ')
      // Keep apostrophes so phrases like "don't" / "we're" still match when
      // the page entity-encodes them; every other entity becomes a space.
      .replace(/&(?:#0*39|#x0*27|apos|rsquo|#0*8217|#x0*2019);/gi, "'")
      .replace(/&[a-z#0-9]+;/gi, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  } catch (error) {
    // Reached when html is not a string (e.g. null/undefined): best-effort.
    logger.error(`Failed to extract visible text: ${error.message}`);
    return '';
  }
}

/**
 * Count whitespace-separated words in text.
 * @param {string} text - Text content
 * @returns {number} Word count (0 for empty or non-string input)
 */
function countWords(text) {
  if (!text || typeof text !== 'string') {
    return 0;
  }

  // Leading/trailing whitespace produces empty strings; drop them.
  return text.split(/\s+/).filter((word) => word.length > 0).length;
}

/**
 * Check if text contains any of the ERROR_INDICATORS (case-insensitive).
 * @param {string} text - Text content
 * @returns {Object|null} `{ detected, indicator, message }` for the first
 *   matching indicator, null otherwise
 */
function detectErrorIndicators(text) {
  if (!text || typeof text !== 'string') {
    return null;
  }

  const lowerText = text.toLowerCase();

  for (const indicator of ERROR_INDICATORS) {
    if (lowerText.includes(indicator.toLowerCase())) {
      return {
        detected: true,
        indicator,
        message: `Error page detected: "${indicator}"`,
      };
    }
  }

  return null;
}

/**
 * Detect if a page is a false-positive error page.
 * Checks for error indicators only when the response has a 2xx status and the
 * page has fewer than 200 visible words.
 *
 * Note: a missing/non-numeric status code fails both range comparisons and is
 * therefore scanned like a 2xx response.
 *
 * @param {string} html - HTML content
 * @param {number} httpStatusCode - HTTP status code
 * @returns {Object} Detection result: always `isErrorPage` and `reason`;
 *   `wordCount` once the text was analysed; `indicator` and `httpStatusCode`
 *   when an error page was detected
 */
export function detectErrorPage(html, httpStatusCode) {
  // Only check pages with 2xx status codes (potential false positives)
  if (httpStatusCode < 200 || httpStatusCode >= 300) {
    return {
      isErrorPage: false,
      reason: 'Non-2xx status code - not a false positive',
    };
  }

  const visibleText = extractVisibleText(html);
  const wordCount = countWords(visibleText);

  logger.debug(`Visible text word count: ${wordCount}`);

  // Content-rich pages that merely mention an error phrase are not error pages.
  if (wordCount >= SPARSE_PAGE_WORD_THRESHOLD) {
    return {
      isErrorPage: false,
      reason: `Page has ${wordCount} visible words (>= ${SPARSE_PAGE_WORD_THRESHOLD} word threshold)`,
      wordCount,
    };
  }

  const errorDetection = detectErrorIndicators(visibleText);

  if (errorDetection) {
    logger.info(
      `Error page detected: ${errorDetection.indicator} (${wordCount} words, status ${httpStatusCode})`
    );
    return {
      isErrorPage: true,
      reason: errorDetection.message,
      indicator: errorDetection.indicator,
      wordCount,
      httpStatusCode,
    };
  }

  return {
    isErrorPage: false,
    reason: `No error indicators found (${wordCount} words)`,
    wordCount,
  };
}

/**
 * Phrases that strongly indicate an under-construction or coming-soon page.
 * Checked case-insensitively against visible text.
 */
const UNDER_CONSTRUCTION_PHRASES = [
  'under construction',
  'coming soon',
  'launching soon',
  'site coming soon',
  'website coming soon',
  'we are coming soon',
  'we are launching soon',
  'something exciting is coming',
  'stay tuned',
  'we are working on it',
  "we're working on it",
  'work in progress',
  'site is being updated',
  'website is being updated',
  'site is currently offline',
  "we'll be back soon",
  'we will be back soon',
  'check back soon',
  'temporarily unavailable',
  'down for maintenance',
  'scheduled maintenance',
  'currently undergoing maintenance',
  'be back shortly',
  'be back soon',
  'parked domain',
  'domain is for sale',
  'this domain is for sale',
  'buy this domain',
  'this website is for sale',
];

/**
 * Phrases that are only reliable as under-construction signals when they appear
 * as standalone text (not inside parentheses labelling a specific item).
 * e.g. "Graffiti Removal (coming soon)" in a nav menu is NOT an under-construction page.
 */
const PARENTHETICAL_FALSE_POSITIVE_PHRASES = new Set([
  'coming soon',
  'launching soon',
  'stay tuned',
  'be back soon',
  'be back shortly',
]);

/**
 * Tell whether the text at `index` sits right after a still-open parenthesis.
 * Only the few characters before `index` are inspected, and a "(" that has
 * already been closed by ")" within that window does not count.
 * @param {string} text - Text being scanned
 * @param {number} index - Start index of the matched phrase
 * @returns {boolean} true when the phrase looks like "(phrase ...)"
 */
function isInsideParentheses(text, index) {
  const before = text.slice(Math.max(0, index - PARENTHESIS_LOOKBEHIND_CHARS), index);
  const openIdx = before.lastIndexOf('(');
  return openIdx !== -1 && !before.includes(')', openIdx);
}

/**
 * Tell whether `phrase` occurs at least once outside of a parenthetical label.
 * Every occurrence is examined, so a parenthetical mention early in the page
 * does not hide a genuine standalone banner later on.
 * @param {string} text - Lower-cased text to scan
 * @param {string} phrase - Lower-cased phrase to look for
 * @returns {boolean} true if a non-parenthetical occurrence exists
 */
function hasStandaloneOccurrence(text, phrase) {
  let idx = text.indexOf(phrase);
  while (idx !== -1) {
    if (!isInsideParentheses(text, idx)) {
      return true;
    }
    idx = text.indexOf(phrase, idx + phrase.length);
  }
  return false;
}

/**
 * Detect if a page is an under-construction, coming-soon, or parked page.
 * Unlike detectErrorPage(), this works on pages of any size and does not
 * require the page to be sparse — construction banners can appear on pages
 * that otherwise have some content.
 *
 * Strips navigation/header/footer elements before checking to avoid false
 * positives from nav items like "Service Name (coming soon)".
 *
 * @param {string} html - HTML content of the page (non-string input is
 *   treated as an empty page)
 * @param {string} [pageTitle] - Optional <title> text, checked together with
 *   the page text
 * @returns {{ isUnderConstruction: boolean, phrase?: string, reason: string }}
 */
export function detectUnderConstruction(html, pageTitle = '') {
  // Guard here: the nav-stripping below runs outside extractVisibleText's
  // try/catch and would otherwise throw on null/undefined html.
  const safeHtml = typeof html === 'string' ? html : '';
  const safeTitle = pageTitle == null ? '' : String(pageTitle);

  // Strip nav/header/footer/menu elements — construction phrases there are false positives
  const bodyHtml = safeHtml
    .replace(/<nav[\s\S]*?<\/nav>/gi, '')
    .replace(/<header[\s\S]*?<\/header>/gi, '')
    .replace(/<footer[\s\S]*?<\/footer>/gi, '')
    .replace(/<ul[^>]*class="[^"]*menu[^"]*"[\s\S]*?<\/ul>/gi, '');

  const text = extractVisibleText(bodyHtml);
  const combined = `${safeTitle} ${text}`.toLowerCase();

  for (const phrase of UNDER_CONSTRUCTION_PHRASES) {
    const lPhrase = phrase.toLowerCase();

    // Ambiguous phrases only count when at least one occurrence is not a
    // parenthetical label such as "(coming soon)".
    const matched = PARENTHETICAL_FALSE_POSITIVE_PHRASES.has(lPhrase)
      ? hasStandaloneOccurrence(combined, lPhrase)
      : combined.includes(lPhrase);
    if (!matched) continue;

    logger.info(`Under-construction page detected: "${phrase}"`);
    return { isUnderConstruction: true, phrase, reason: `Page contains "${phrase}"` };
  }

  return { isUnderConstruction: false, reason: 'No under-construction indicators found' };
}

export { extractVisibleText, countWords, ERROR_INDICATORS };

export default {
  detectErrorPage,
  detectUnderConstruction,
  extractVisibleText,
  countWords,
  ERROR_INDICATORS,
};