/ src / utils / error-page-detector.js
error-page-detector.js
  1  /**
  2   * Error Page Detector
  3   * Detects false-positive 200 responses that are actually error pages
  4   * (e.g., HTTP 403 errors rendered as HTML with 200 status code)
  5   */
  6  
  7  import Logger from './logger.js';
  8  
// Module-level logger used by the extraction and detection functions below;
// constructed with the label 'ErrorPageDetector'.
const logger = new Logger('ErrorPageDetector');
 10  
 11  /**
 12   * Error indicators to look for in visible text
 13   */
 14  const ERROR_INDICATORS = [
 15    'HTTP ERROR',
 16    'HTTP Error',
 17    'Permission Denied',
 18    'Permission denied',
 19    'Access Denied',
 20    'Access denied',
 21    "You don't have authorisation to view this page",
 22    "You don't have authorization to view this page",
 23    'You do not have permission',
 24    '403 Forbidden',
 25    'Forbidden',
 26    '401 Unauthorized',
 27    'Unauthorized',
 28    '404 Not Found',
 29    '500 Internal Server Error',
 30    'Internal Server Error',
 31    'Service Unavailable',
 32    '503 Service Unavailable',
 33    'This page is not available',
 34    'Page not available',
 35    'Access to this resource is denied',
 36    'You are not authorized',
 37    'Authentication required',
 38    'Login required',
 39  ];
 40  
 41  /**
 42   * Extract visible text from HTML DOM
 43   * Removes script, style, and hidden elements
 44   * @param {string} html - HTML content
 45   * @returns {string} Visible text content
 46   */
 47  function extractVisibleText(html) {
 48    try {
 49      // Strip non-visible blocks, then all tags, then normalize whitespace
 50      const text = html
 51        .replace(/<script[\s\S]*?<\/script>/gi, '')
 52        .replace(/<style[\s\S]*?<\/style>/gi, '')
 53        .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
 54        .replace(/<iframe[\s\S]*?<\/iframe>/gi, '')
 55        .replace(/<[^>]+>/g, ' ')
 56        .replace(/&[a-z#0-9]+;/gi, ' ')
 57        .replace(/\s+/g, ' ')
 58        .trim();
 59      return text;
 60    } catch (error) {
 61      logger.error(`Failed to extract visible text: ${error.message}`);
 62      return '';
 63    }
 64  }
 65  
 66  /**
 67   * Count visible words in text
 68   * @param {string} text - Text content
 69   * @returns {number} Word count
 70   */
 71  function countWords(text) {
 72    if (!text || typeof text !== 'string') {
 73      return 0;
 74    }
 75  
 76    // Split on whitespace and filter out empty strings
 77    const words = text.split(/\s+/).filter(word => word.length > 0);
 78    return words.length;
 79  }
 80  
 81  /**
 82   * Check if text contains error indicators
 83   * @param {string} text - Text content
 84   * @returns {Object|null} Error info if detected, null otherwise
 85   */
 86  function detectErrorIndicators(text) {
 87    if (!text || typeof text !== 'string') {
 88      return null;
 89    }
 90  
 91    // Case-insensitive search for error indicators
 92    const lowerText = text.toLowerCase();
 93  
 94    for (const indicator of ERROR_INDICATORS) {
 95      if (lowerText.includes(indicator.toLowerCase())) {
 96        return {
 97          detected: true,
 98          indicator,
 99          message: `Error page detected: "${indicator}"`,
100        };
101      }
102    }
103  
104    return null;
105  }
106  
/**
 * Decide whether a successful (2xx) response is really an error page.
 *
 * A page is reported as an error page only when its status is not outside the
 * 2xx range, its visible text has fewer than 200 words, and that text contains
 * one of the known error indicators. A status that is not comparable as a
 * number (e.g. undefined) is not rejected by the range check and is examined
 * like a 2xx response.
 *
 * @param {string} html - HTML content
 * @param {number} httpStatusCode - HTTP status code
 * @returns {Object} Detection result: always `isErrorPage` and `reason`;
 *   `wordCount` once the text was measured; `indicator` and `httpStatusCode`
 *   when an error page was detected
 */
export function detectErrorPage(html, httpStatusCode) {
  // Real error statuses are not false positives, so there is nothing to do.
  const outside2xx = httpStatusCode < 200 || httpStatusCode >= 300;
  if (outside2xx) {
    return {
      isErrorPage: false,
      reason: 'Non-2xx status code - not a false positive',
    };
  }

  const visibleText = extractVisibleText(html);
  const wordCount = countWords(visibleText);
  logger.debug(`Visible text word count: ${wordCount}`);

  // Pages with 200 or more words are never inspected for indicators.
  if (wordCount >= 200) {
    return {
      isErrorPage: false,
      reason: `Page has ${wordCount} visible words (>= 200 word threshold)`,
      wordCount,
    };
  }

  const match = detectErrorIndicators(visibleText);
  if (!match) {
    return {
      isErrorPage: false,
      reason: `No error indicators found (${wordCount} words)`,
      wordCount,
    };
  }

  const { indicator, message } = match;
  logger.info(
    `Error page detected: ${indicator} (${wordCount} words, status ${httpStatusCode})`
  );
  return {
    isErrorPage: true,
    reason: message,
    indicator,
    wordCount,
    httpStatusCode,
  };
}
160  
/**
 * Phrases that strongly indicate an under-construction, coming-soon,
 * maintenance, or parked-domain page.
 * Checked case-insensitively, in list order, against the page title plus the
 * visible text; the first phrase that matches is the one reported.
 */
const UNDER_CONSTRUCTION_PHRASES = [
  'under construction',
  'coming soon',
  'launching soon',
  'site coming soon',
  'website coming soon',
  'we are coming soon',
  'we are launching soon',
  'something exciting is coming',
  'stay tuned',
  'we are working on it',
  "we're working on it",
  'work in progress',
  'site is being updated',
  'website is being updated',
  'site is currently offline',
  "we'll be back soon",
  'we will be back soon',
  'check back soon',
  'temporarily unavailable',
  'down for maintenance',
  'scheduled maintenance',
  'currently undergoing maintenance',
  'be back shortly',
  'be back soon',
  'parked domain',
  'domain is for sale',
  'this domain is for sale',
  'buy this domain',
  'this website is for sale',
];

/**
 * Phrases that are only reliable as under-construction signals when they appear
 * as standalone text (not inside parentheses labelling a specific item).
 * e.g. "Graffiti Removal (coming soon)" in a nav menu is NOT an under-construction page.
 */
const PARENTHETICAL_FALSE_POSITIVE_PHRASES = new Set([
  'coming soon',
  'launching soon',
  'stay tuned',
  'be back soon',
  'be back shortly',
]);

/**
 * Report whether `phrase` occurs in `text` at least once without an opening
 * parenthesis in the five characters directly before it.
 * Every occurrence is examined, so a parenthesised label early in the text
 * cannot hide a standalone banner further down.
 * @param {string} text - Lower-cased text to search
 * @param {string} phrase - Lower-cased phrase to look for
 * @returns {boolean} True if a non-parenthesised occurrence exists
 */
function hasStandaloneOccurrence(text, phrase) {
  let idx = text.indexOf(phrase);
  while (idx !== -1) {
    const before = text.slice(Math.max(0, idx - 5), idx);
    if (!before.includes('(')) {
      return true;
    }
    idx = text.indexOf(phrase, idx + 1);
  }
  return false;
}

/**
 * Detect if a page is an under-construction, coming-soon, or parked page.
 * Unlike detectErrorPage(), this works on pages of any size and does not
 * require the page to be sparse — construction banners can appear on pages
 * that otherwise have some content.
 *
 * Strips <nav>, <header>, <footer> and <ul class="...menu..."> elements before
 * checking to avoid false positives from nav items like
 * "Service Name (coming soon)".
 *
 * @param {string} html - HTML content of the page; a non-string value is
 *   treated as an empty document (the title is still checked)
 * @param {string} [pageTitle] - Optional <title> text, searched together with
 *   the visible body text; null is treated as empty
 * @returns {{ isUnderConstruction: boolean, phrase?: string, reason: string }}
 */
export function detectUnderConstruction(html, pageTitle = '') {
  // Strip nav/header/footer/menu elements — construction phrases there are false positives
  const bodyHtml =
    typeof html === 'string'
      ? html
          .replace(/<nav[\s\S]*?<\/nav>/gi, '')
          .replace(/<header[\s\S]*?<\/header>/gi, '')
          .replace(/<footer[\s\S]*?<\/footer>/gi, '')
          .replace(/<ul[^>]*class="[^"]*menu[^"]*"[\s\S]*?<\/ul>/gi, '')
      : '';

  const text = bodyHtml === '' ? '' : extractVisibleText(bodyHtml);
  // Keep a null title from being searched as the literal text "null".
  const title = pageTitle == null ? '' : String(pageTitle);
  const combined = `${title} ${text}`.toLowerCase();

  for (const phrase of UNDER_CONSTRUCTION_PHRASES) {
    const lPhrase = phrase.toLowerCase();

    // Ambiguous phrases only count when some occurrence is not of the
    // "(coming soon)" form; all other phrases match anywhere.
    const matched = PARENTHETICAL_FALSE_POSITIVE_PHRASES.has(lPhrase)
      ? hasStandaloneOccurrence(combined, lPhrase)
      : combined.includes(lPhrase);
    if (!matched) continue;

    logger.info(`Under-construction page detected: "${phrase}"`);
    return { isUnderConstruction: true, phrase, reason: `Page contains "${phrase}"` };
  }

  return { isUnderConstruction: false, reason: 'No under-construction indicators found' };
}
251  
// Named exports for the lower-level helpers and the indicator list.
// detectErrorPage and detectUnderConstruction are exported at their
// definitions; detectErrorIndicators is not exported.
export { extractVisibleText, countWords, ERROR_INDICATORS };

// Default export: a single object bundling the same five public members.
export default {
  detectErrorPage,
  detectUnderConstruction,
  extractVisibleText,
  countWords,
  ERROR_INDICATORS,
};