Cradicle Explorer

/ .github / scripts / position-description-ingester.js
position-description-ingester.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Position Description Ingester
  5   * 
  6   * Analyzes job descriptions to extract key requirements, skills, and context
  7   * for tailoring CV content to specific opportunities. This enables the AI
  8   * enhancement system to emphasize relevant experience and skills.
  9   * 
 10   * Features:
  11   * - Parse job descriptions from inline text or UTF-8 text files (URL ingestion is a placeholder; PDF parsing is not implemented)
 12   * - Extract key skills, technologies, and requirements
 13   * - Analyze company culture and values alignment
 14   * - Generate targeting insights for CV customization
 15   * - Store processed job descriptions for reuse
 16   * 
 17   * Usage: 
 18   *   node position-description-ingester.js --text "job description text"
 19   *   node position-description-ingester.js --url "https://company.com/job"
  20   *   node position-description-ingester.js --file "job-description.txt"
 21   * 
 22   * @author Adrian Wedd
 23   * @version 1.0.0
 24   */
 25  
 26  const fs = require('fs').promises;
 27  const path = require('path');
 28  const crypto = require('crypto');
 29  
 30  class PositionDescriptionIngester {
 31      constructor() {
 32          this.dataDir = path.resolve(__dirname, '../../data');
 33          this.positionsDir = path.join(this.dataDir, 'positions');
 34          this.outputDir = path.join(this.dataDir, 'targeting');
 35          
 36          // Skill categories for classification
 37          this.skillCategories = {
 38              'programming': [
 39                  'python', 'javascript', 'typescript', 'java', 'c++', 'c#', 'rust', 'go', 'php',
 40                  'ruby', 'swift', 'kotlin', 'scala', 'r', 'matlab', 'sql', 'html', 'css'
 41              ],
 42              'frameworks': [
 43                  'react', 'vue', 'angular', 'django', 'flask', 'spring', 'express', 'fastapi',
 44                  'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy', 'bootstrap'
 45              ],
 46              'cloud_platforms': [
 47                  'aws', 'azure', 'gcp', 'google cloud', 'digitalocean', 'heroku', 'vercel'
 48              ],
 49              'devops': [
 50                  'docker', 'kubernetes', 'jenkins', 'github actions', 'ci/cd', 'terraform',
 51                  'ansible', 'puppet', 'chef', 'vagrant', 'monitoring', 'logging'
 52              ],
 53              'databases': [
 54                  'postgresql', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'cassandra',
 55                  'dynamodb', 'sqlite', 'oracle', 'sql server'
 56              ],
 57              'ai_ml': [
 58                  'machine learning', 'artificial intelligence', 'deep learning', 'nlp',
 59                  'computer vision', 'data science', 'neural networks', 'transformers',
 60                  'llm', 'generative ai', 'gpt', 'claude', 'openai'
 61              ],
 62              'systems': [
 63                  'linux', 'unix', 'windows server', 'networking', 'security', 'cybersecurity',
 64                  'system administration', 'infrastructure', 'performance tuning'
 65              ],
 66              'methodologies': [
 67                  'agile', 'scrum', 'kanban', 'devops', 'microservices', 'api design',
 68                  'system design', 'architecture', 'testing', 'tdd', 'bdd'
 69              ]
 70          };
 71          
 72          // Experience level indicators
 73          this.experienceLevels = {
 74              'entry': ['junior', 'entry', 'graduate', 'associate', '0-2 years', 'new grad'],
 75              'mid': ['mid', 'intermediate', '2-5 years', '3-7 years', 'experienced'],
 76              'senior': ['senior', 'lead', 'principal', '5+ years', '7+ years', 'expert'],
 77              'management': ['manager', 'director', 'head of', 'vp', 'chief', 'team lead']
 78          };
 79          
 80          // Company culture indicators
 81          this.cultureIndicators = {
 82              'innovation': ['innovative', 'cutting-edge', 'pioneering', 'disruptive', 'startup'],
 83              'growth': ['scaling', 'growing', 'expanding', 'fast-paced', 'dynamic'],
 84              'collaboration': ['team', 'collaborative', 'cross-functional', 'partnership'],
 85              'impact': ['mission', 'purpose', 'social impact', 'meaningful', 'change'],
 86              'stability': ['established', 'stable', 'mature', 'enterprise', 'fortune']
 87          };
 88      }
 89  
 90      /**
 91       * Initialize the ingestion system
 92       */
 93      async initialize() {
 94          console.log('🎯 Initializing Position Description Ingester...');
 95          
 96          // Create directories if they don't exist
 97          await fs.mkdir(this.positionsDir, { recursive: true });
 98          await fs.mkdir(this.outputDir, { recursive: true });
 99          
100          console.log('✅ Directory structure initialized');
101      }
102  
103      /**
104       * Ingest job description from text
105       */
106      async ingestFromText(jobText, metadata = {}) {
107          console.log('📝 Processing job description from text...');
108          
109          const analysis = await this.analyzeJobDescription(jobText, metadata);
110          const jobId = this.generateJobId(jobText, metadata);
111          
112          await this.saveJobAnalysis(jobId, analysis);
113          await this.generateTargetingInsights(jobId, analysis);
114          
115          console.log(`✅ Job analysis completed: ${jobId}`);
116          return { jobId, analysis };
117      }
118      
119      /**
120       * Ingest job description from URL
121       */
122      async ingestFromUrl(url) {
123          console.log(`🌐 Processing job description from URL: ${url}`);
124          
125          try {
126              // For now, this is a placeholder for web scraping functionality
127              // In the future, this could use Playwright or similar tools
128              console.log('⚠️ URL ingestion requires web scraping implementation');
129              console.log('💡 For now, please copy the job description text and use --text option');
130              
131              return { jobId: null, analysis: null, error: 'URL ingestion not yet implemented' };
132          } catch (error) {
133              console.error('❌ Failed to ingest from URL:', error.message);
134              return { jobId: null, analysis: null, error: error.message };
135          }
136      }
137      
138      /**
139       * Ingest job description from file
140       */
141      async ingestFromFile(filePath) {
142          console.log(`📄 Processing job description from file: ${filePath}`);
143          
144          try {
145              const content = await fs.readFile(filePath, 'utf8');
146              const metadata = {
147                  source: 'file',
148                  file_path: filePath,
149                  file_name: path.basename(filePath)
150              };
151              
152              return await this.ingestFromText(content, metadata);
153          } catch (error) {
154              console.error('❌ Failed to read file:', error.message);
155              return { jobId: null, analysis: null, error: error.message };
156          }
157      }
158  
159      /**
160       * Analyze job description content
161       */
162      async analyzeJobDescription(text, metadata = {}) {
163          const analysis = {
164              metadata: {
165                  analyzed_at: new Date().toISOString(),
166                  source: metadata.source || 'text',
167                  ingester_version: '1.0.0',
168                  ...metadata
169              },
170              raw_text: text,
171              extracted_data: {},
172              targeting_insights: {}
173          };
174  
175          // Extract basic information
176          analysis.extracted_data = {
177              job_title: this.extractJobTitle(text),
178              company: this.extractCompany(text),
179              location: this.extractLocation(text),
180              experience_level: this.classifyExperienceLevel(text),
181              employment_type: this.extractEmploymentType(text),
182              salary_range: this.extractSalaryRange(text)
183          };
184  
185          // Extract skills and technologies
186          analysis.extracted_data.required_skills = this.extractSkills(text, 'required');
187          analysis.extracted_data.preferred_skills = this.extractSkills(text, 'preferred');
188          analysis.extracted_data.technology_stack = this.categorizeSkills(
189              [...analysis.extracted_data.required_skills, ...analysis.extracted_data.preferred_skills]
190          );
191  
192          // Extract responsibilities and requirements
193          analysis.extracted_data.key_responsibilities = this.extractResponsibilities(text);
194          analysis.extracted_data.qualifications = this.extractQualifications(text);
195          
196          // Analyze company culture
197          analysis.extracted_data.culture_indicators = this.analyzeCulture(text);
198          
199          // Generate matching insights
200          analysis.targeting_insights = await this.generateMatchingInsights(analysis.extracted_data);
201  
202          return analysis;
203      }
204  
205      /**
206       * Extract job title from text
207       */
208      extractJobTitle(text) {
209          // Look for common job title patterns
210          const titlePatterns = [
211              /(?:position|role|job|title):\s*([^\n]+)/i,
212              /^([^\n]+(?:engineer|developer|analyst|manager|director|specialist|coordinator))/i,
213              /job title:\s*([^\n]+)/i
214          ];
215  
216          for (const pattern of titlePatterns) {
217              const match = text.match(pattern);
218              if (match) {
219                  return match[1].trim();
220              }
221          }
222  
223          // Fallback: look for capitalized words that might be job titles
224          const lines = text.split('\n').slice(0, 5); // Check first few lines
225          for (const line of lines) {
226              if (line.length < 100 && /[A-Z]/.test(line) && 
227                  (line.includes('Engineer') || line.includes('Developer') || 
228                   line.includes('Analyst') || line.includes('Manager'))) {
229                  return line.trim();
230              }
231          }
232  
233          return 'Unknown Position';
234      }
235  
236      /**
237       * Extract company name from text
238       */
239      extractCompany(text) {
240          const companyPatterns = [
241              /company:\s*([^\n]+)/i,
242              /employer:\s*([^\n]+)/i,
243              /organization:\s*([^\n]+)/i,
244              /(?:at|join)\s+([A-Z][a-zA-Z\s&]+?)(?:\s+is|,|\.|$)/
245          ];
246  
247          for (const pattern of companyPatterns) {
248              const match = text.match(pattern);
249              if (match) {
250                  return match[1].trim();
251              }
252          }
253  
254          return 'Unknown Company';
255      }
256  
257      /**
258       * Extract location from text
259       */
260      extractLocation(text) {
261          const locationPatterns = [
262              /location:\s*([^\n]+)/i,
263              /based in:\s*([^\n]+)/i,
264              /(?:remote|hybrid|onsite).*?([A-Z][a-z]+,\s*[A-Z]{2,})/,
265              /([A-Z][a-z]+,\s*(?:Australia|USA|UK|Canada|Germany))/i
266          ];
267  
268          for (const pattern of locationPatterns) {
269              const match = text.match(pattern);
270              if (match) {
271                  return match[1].trim();
272              }
273          }
274  
275          return 'Location not specified';
276      }
277  
278      /**
279       * Extract employment type from text
280       */
281      extractEmploymentType(text) {
282          const lowerText = text.toLowerCase();
283          
284          if (lowerText.includes('full-time') || lowerText.includes('full time')) {
285              return 'full-time';
286          } else if (lowerText.includes('part-time') || lowerText.includes('part time')) {
287              return 'part-time';
288          } else if (lowerText.includes('contract') || lowerText.includes('contractor')) {
289              return 'contract';
290          } else if (lowerText.includes('freelance') || lowerText.includes('consultant')) {
291              return 'freelance';
292          } else if (lowerText.includes('intern') || lowerText.includes('internship')) {
293              return 'internship';
294          }
295          
296          return 'not specified';
297      }
298  
299      /**
300       * Extract salary range from text
301       */
302      extractSalaryRange(text) {
303          const salaryPatterns = [
304              /\$[\d,]+\s*-\s*\$[\d,]+/g,
305              /salary:\s*([^\n]+)/i,
306              /compensation:\s*([^\n]+)/i,
307              /[\d,]+k?\s*-\s*[\d,]+k?\s*(?:AUD|USD|per year|annually)/i
308          ];
309  
310          for (const pattern of salaryPatterns) {
311              const match = text.match(pattern);
312              if (match) {
313                  return match[0].trim();
314              }
315          }
316  
317          return 'not specified';
318      }
319  
320      /**
321       * Classify experience level
322       */
323      classifyExperienceLevel(text) {
324          const lowerText = text.toLowerCase();
325          
326          for (const [level, indicators] of Object.entries(this.experienceLevels)) {
327              for (const indicator of indicators) {
328                  if (lowerText.includes(indicator)) {
329                      return level;
330                  }
331              }
332          }
333          
334          return 'unspecified';
335      }
336  
337      /**
338       * Extract skills from text
339       */
340      extractSkills(text, _type = 'all') {
341          const skills = new Set();
342          const lowerText = text.toLowerCase();
343          
344          // Check all skill categories
345          for (const [_category, skillList] of Object.entries(this.skillCategories)) {
346              for (const skill of skillList) {
347                  if (lowerText.includes(skill.toLowerCase())) {
348                      skills.add(skill);
349                  }
350              }
351          }
352          
353          return Array.from(skills);
354      }
355  
356      /**
357       * Categorize skills by type
358       */
359      categorizeSkills(skills) {
360          const categorized = {};
361          
362          for (const [category, skillList] of Object.entries(this.skillCategories)) {
363              categorized[category] = skills.filter(skill => 
364                  skillList.some(s => s.toLowerCase() === skill.toLowerCase())
365              );
366          }
367          
368          return categorized;
369      }
370  
371      /**
372       * Extract key responsibilities
373       */
374      extractResponsibilities(text) {
375          const responsibilities = [];
376          const lines = text.split('\n');
377          
378          let inResponsibilities = false;
379          for (const line of lines) {
380              const trimmed = line.trim();
381              
382              if (/responsibilities|duties|role includes/i.test(trimmed)) {
383                  inResponsibilities = true;
384                  continue;
385              }
386              
387              if (inResponsibilities) {
388                  if (/requirements|qualifications|skills/i.test(trimmed)) {
389                      break;
390                  }
391                  
392                  if (trimmed.match(/^[-•*]\s+(.+)/) || trimmed.match(/^\d+\.\s+(.+)/)) {
393                      responsibilities.push(trimmed.replace(/^[-•*]\s*/, '').replace(/^\d+\.\s*/, ''));
394                  }
395              }
396          }
397          
398          return responsibilities;
399      }
400  
401      /**
402       * Extract qualifications
403       */
404      extractQualifications(text) {
405          const qualifications = [];
406          const lines = text.split('\n');
407          
408          let inQualifications = false;
409          for (const line of lines) {
410              const trimmed = line.trim();
411              
412              if (/requirements|qualifications|must have|essential/i.test(trimmed)) {
413                  inQualifications = true;
414                  continue;
415              }
416              
417              if (inQualifications) {
418                  if (/benefits|compensation|about us/i.test(trimmed)) {
419                      break;
420                  }
421                  
422                  if (trimmed.match(/^[-•*]\s+(.+)/) || trimmed.match(/^\d+\.\s+(.+)/)) {
423                      qualifications.push(trimmed.replace(/^[-•*]\s*/, '').replace(/^\d+\.\s*/, ''));
424                  }
425              }
426          }
427          
428          return qualifications;
429      }
430  
431      /**
432       * Analyze company culture indicators
433       */
434      analyzeCulture(text) {
435          const cultureScore = {};
436          const lowerText = text.toLowerCase();
437          
438          for (const [culture, indicators] of Object.entries(this.cultureIndicators)) {
439              let score = 0;
440              for (const indicator of indicators) {
441                  if (lowerText.includes(indicator)) {
442                      score++;
443                  }
444              }
445              cultureScore[culture] = score;
446          }
447          
448          // Find dominant culture traits
449          const sortedCulture = Object.entries(cultureScore)
450              .sort(([,a], [,b]) => b - a)
451              .filter(([,score]) => score > 0);
452              
453          return {
454              scores: cultureScore,
455              dominant_traits: sortedCulture.slice(0, 3).map(([trait]) => trait)
456          };
457      }
458  
459      /**
460       * Generate matching insights
461       */
462      async generateMatchingInsights(extractedData) {
463          // Load current CV data for comparison
464          let cvData = {};
465          try {
466              const cvPath = path.join(this.dataDir, 'base-cv.json');
467              const content = await fs.readFile(cvPath, 'utf8');
468              cvData = JSON.parse(content);
469          } catch {
470              console.warn('⚠️ Could not load CV data for matching analysis');
471          }
472  
473          const insights = {
474              skill_matches: this.analyzeSkillMatches(extractedData, cvData),
475              experience_alignment: this.analyzeExperienceAlignment(extractedData, cvData),
476              culture_fit: this.analyzeCultureFit(extractedData, cvData),
477              enhancement_recommendations: this.generateEnhancementRecommendations(extractedData, cvData)
478          };
479  
480          return insights;
481      }
482  
483      /**
484       * Analyze skill matches between job and CV
485       */
486      analyzeSkillMatches(jobData, cvData) {
487          const cvSkills = new Set();
488          
489          // Extract skills from CV
490          if (cvData.skills) {
491              cvData.skills.forEach(skill => cvSkills.add(skill.name.toLowerCase()));
492          }
493          
494          const requiredMatches = jobData.required_skills.filter(skill => 
495              cvSkills.has(skill.toLowerCase())
496          );
497          
498          const preferredMatches = jobData.preferred_skills.filter(skill => 
499              cvSkills.has(skill.toLowerCase())
500          );
501          
502          const missingRequired = jobData.required_skills.filter(skill => 
503              !cvSkills.has(skill.toLowerCase())
504          );
505  
506          return {
507              required_matches: requiredMatches,
508              preferred_matches: preferredMatches,
509              missing_required: missingRequired,
510              match_percentage: requiredMatches.length / Math.max(jobData.required_skills.length, 1) * 100
511          };
512      }
513  
514      /**
515       * Analyze experience alignment
516       */
517      analyzeExperienceAlignment(jobData, cvData) {
518          const alignment = {
519              level_match: false,
520              relevant_experience: [],
521              transferable_skills: []
522          };
523  
524          // Check experience level alignment
525          if (cvData.experience) {
526              const totalYears = cvData.experience.reduce((total, exp) => {
527                  const years = this.calculateYears(exp.period);
528                  return total + years;
529              }, 0);
530  
531              alignment.level_match = this.matchesExperienceLevel(jobData.experience_level, totalYears);
532              alignment.total_years = totalYears;
533          }
534  
535          return alignment;
536      }
537  
538      /**
539       * Analyze culture fit
540       */
541      analyzeCultureFit(jobData, _cvData) {
542          const cultureFit = {
543              alignment_score: 0,
544              matching_values: [],
545              recommendations: []
546          };
547  
548          // This is a simplified culture analysis
549          // In practice, this would involve more sophisticated matching
550          if (jobData.culture_indicators?.dominant_traits) {
551              cultureFit.identified_culture = jobData.culture_indicators.dominant_traits;
552          }
553  
554          return cultureFit;
555      }
556  
557      /**
558       * Generate enhancement recommendations
559       */
560      generateEnhancementRecommendations(jobData, _cvData) {
561          const recommendations = [];
562  
563          // Skill gap recommendations
564          if (jobData.required_skills) {
565              recommendations.push({
566                  type: 'skills',
567                  priority: 'high',
568                  action: 'Emphasize matching skills in professional summary',
569                  skills: jobData.required_skills.slice(0, 5)
570              });
571          }
572  
573          // Experience emphasis
574          recommendations.push({
575              type: 'experience',
576              priority: 'medium',
577              action: 'Highlight relevant experience that matches job responsibilities',
578              focus_areas: jobData.key_responsibilities?.slice(0, 3) || []
579          });
580  
581          return recommendations;
582      }
583  
584      /**
585       * Generate unique job ID
586       */
587      generateJobId(text, metadata) {
588          const content = text + JSON.stringify(metadata);
589          const hash = crypto.createHash('md5').update(content).digest('hex');
590          const timestamp = new Date().toISOString().slice(0, 10);
591          return `job-${timestamp}-${hash.slice(0, 8)}`;
592      }
593  
594      /**
595       * Save job analysis results
596       */
597      async saveJobAnalysis(jobId, analysis) {
598          const filePath = path.join(this.positionsDir, `${jobId}.json`);
599          await fs.writeFile(filePath, JSON.stringify(analysis, null, 2), 'utf8');
600          console.log(`💾 Job analysis saved: ${filePath}`);
601      }
602  
603      /**
604       * Generate targeting insights file
605       */
606      async generateTargetingInsights(jobId, analysis) {
607          const insights = {
608              job_id: jobId,
609              generated_at: new Date().toISOString(),
610              targeting_profile: {
611                  position: analysis.extracted_data.job_title,
612                  company: analysis.extracted_data.company,
613                  key_skills: analysis.extracted_data.required_skills.slice(0, 10),
614                  experience_level: analysis.extracted_data.experience_level,
615                  culture_traits: analysis.extracted_data.culture_indicators?.dominant_traits || []
616              },
617              cv_customization: analysis.targeting_insights
618          };
619  
620          const insightsPath = path.join(this.outputDir, `targeting-${jobId}.json`);
621          await fs.writeFile(insightsPath, JSON.stringify(insights, null, 2), 'utf8');
622          
623          // Also save as latest targeting insights
624          const latestPath = path.join(this.outputDir, 'latest-targeting.json');
625          await fs.writeFile(latestPath, JSON.stringify(insights, null, 2), 'utf8');
626          
627          console.log(`🎯 Targeting insights saved: ${insightsPath}`);
628      }
629  
630      /**
631       * Helper methods
632       */
633      calculateYears(period) {
634          // Simple year calculation - in practice this would be more sophisticated
635          const yearMatch = period.match(/(\d{4})\s*-\s*(\d{4}|Present)/);
636          if (yearMatch) {
637              const start = parseInt(yearMatch[1]);
638              const end = yearMatch[2] === 'Present' ? new Date().getFullYear() : parseInt(yearMatch[2]);
639              return end - start;
640          }
641          return 0;
642      }
643  
644      matchesExperienceLevel(jobLevel, totalYears) {
645          const levelRanges = {
646              'entry': [0, 2],
647              'mid': [2, 7],
648              'senior': [7, Infinity],
649              'management': [5, Infinity]
650          };
651  
652          if (levelRanges[jobLevel]) {
653              const [min, max] = levelRanges[jobLevel];
654              return totalYears >= min && totalYears <= max;
655          }
656  
657          return false;
658      }
659  }
660  
661  // CLI interface
/**
 * CLI entry point.
 *
 * Recognizes `--text <description>`, `--url <url>` and `--file <path>` (checked
 * in that order) and prints usage when none is given. Arguments are validated
 * before the ingester is initialized, so printing usage or rejecting a missing
 * value does not create any data directories. Exits the process with code 1
 * when a value is missing or ingestion reports an error.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const args = process.argv.slice(2);

    const missingValueMessages = {
        '--text': '❌ Please provide job description text after --text',
        '--url': '❌ Please provide URL after --url',
        '--file': '❌ Please provide file path after --file'
    };

    // Object key order preserves the original precedence: text, url, file.
    const mode = Object.keys(missingValueMessages).find(flag => args.includes(flag));

    if (!mode) {
        console.log('Position Description Ingester');
        console.log('');
        console.log('Usage:');
        console.log('  node position-description-ingester.js --text "job description text"');
        console.log('  node position-description-ingester.js --url "https://company.com/job"');
        console.log('  node position-description-ingester.js --file "job-description.txt"');
        console.log('');
        console.log('The ingester will analyze the job description and generate targeting');
        console.log('insights to help customize your CV for the specific opportunity.');
        return;
    }

    const value = args[args.indexOf(mode) + 1];
    if (!value) {
        console.error(missingValueMessages[mode]);
        process.exit(1);
    }

    const ingester = new PositionDescriptionIngester();
    await ingester.initialize();

    if (mode === '--text') {
        const result = await ingester.ingestFromText(value, { source: 'cli_text' });
        console.log(`\n✅ Processing complete. Job ID: ${result.jobId}`);
        return;
    }

    const result = mode === '--url'
        ? await ingester.ingestFromUrl(value)
        : await ingester.ingestFromFile(value);

    if (result.error) {
        console.error(`❌ ${result.error}`);
        process.exit(1);
    }

    // URL ingestion never produced a completion message; only file ingestion does.
    if (mode === '--file') {
        console.log(`\n✅ Processing complete. Job ID: ${result.jobId}`);
    }
}
720  
// Run the CLI only when this file is executed directly, not when it is required.
if (require.main === module) {
    main().catch((error) => {
        console.error(error);
        // Signal failure to the calling shell/CI step; logging alone exits with 0.
        process.exitCode = 1;
    });
}

module.exports = PositionDescriptionIngester;