/ scraping.ipynb
scraping.ipynb
# %%
# One-time environment setup; uncomment and run if the packages are missing.
#!pip3 install undetected-chromedriver
#!pip3 install undetected-chromedriver selenium pandas
#!pip3 install openpyxl
#!pip3 install beautifulsoup4

# %%
import datetime
import time
from urllib.parse import quote, urlencode

import pandas as pd
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# %%
# ---- Scraper configuration ----
SEARCH_BASE_URL = "https://www.linkedin.com/jobs/search"
JOB_CARD_CLASS = "base-card"   # CSS class that marks one job card on the results page
SCROLL_STEPS = 3               # number of scroll increments used to trigger lazy loading
SCROLL_PIXELS = 800            # vertical distance of each scroll increment
SCROLL_PAUSE_SECONDS = 2       # pause after each scroll so new cards can load
CARD_WAIT_SECONDS = 15         # how long to wait for the first job card to appear
MAX_JOBS = 10                  # default cap on the number of cards that are parsed


# %%
# ---- Function to Scrape LinkedIn Jobs Using Selenium ----
def _build_search_url(keyword, location):
    """Return the job-search URL with both search terms percent-encoded."""
    # quote_via=quote keeps spaces as %20 (as the hand-built URL did) while also
    # escaping characters such as '&', '+' and '#' that would otherwise corrupt
    # the query string.
    query = urlencode({"keywords": keyword, "location": location}, quote_via=quote)
    return f"{SEARCH_BASE_URL}?{query}"


def _element_text(element):
    """Return an element's text, falling back to its raw DOM text content."""
    text = element.text.strip()
    if text:
        return text
    # WebElement.text only reports rendered text. An earlier headless run of
    # this notebook recorded empty titles/companies next to valid links, so
    # fall back to textContent and collapse its internal whitespace.
    raw = element.get_attribute("textContent") or ""
    return " ".join(raw.split())


def scrape_linkedin_jobs(keyword, location, max_jobs=MAX_JOBS):
    """Scrape job cards from the LinkedIn job-search page.

    Opens a headless Chrome session through undetected-chromedriver, loads the
    search page for ``keyword`` / ``location``, scrolls to trigger loading of
    further cards, and parses up to ``max_jobs`` cards.

    Args:
        keyword: Job title or search phrase.
        location: Location filter for the search.
        max_jobs: Maximum number of job cards to parse (defaults to 10).

    Returns:
        A list of dicts with the keys ``title``, ``company``, ``link`` and
        ``source`` (always ``"LinkedIn"``). Cards that fail to parse are
        reported on stdout and skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: if no job card is present
            within ``CARD_WAIT_SECONDS`` seconds.
    """
    print("\nš Scraping LinkedIn Jobs...\n")

    # Configure Selenium WebDriver (headless mode: no browser window is opened).
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=options)
    try:
        driver.get(_build_search_url(keyword, location))

        # Scroll down in steps so that more job cards get loaded.
        for _ in range(SCROLL_STEPS):
            driver.execute_script(f"window.scrollBy(0, {SCROLL_PIXELS});")
            time.sleep(SCROLL_PAUSE_SECONDS)

        # Block until at least one job card is present in the DOM.
        WebDriverWait(driver, CARD_WAIT_SECONDS).until(
            EC.presence_of_element_located((By.CLASS_NAME, JOB_CARD_CLASS))
        )

        jobs = []
        cards = driver.find_elements(By.CLASS_NAME, JOB_CARD_CLASS)
        for card in cards[:max_jobs]:
            try:
                title = _element_text(card.find_element(By.CSS_SELECTOR, "h3"))
                company = _element_text(card.find_element(By.CSS_SELECTOR, "h4"))
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception as e:
                # Best effort: one malformed card must not abort the whole scrape.
                print(f"ā ļø Skipping a job entry due to error: {e}")
                continue
            jobs.append(
                {"title": title, "company": company, "link": link, "source": "LinkedIn"}
            )
        return jobs
    finally:
        # Always shut the browser down, even when loading or waiting fails;
        # otherwise every failed run leaves a headless Chrome process behind.
        driver.quit()


# %%
# ---- Run the Script and Save to Excel ----
if __name__ == "__main__":
    keyword = input("Enter job title (e.g., Software Engineer): ")
    location = input("Enter location (e.g., Remote, New York, Berlin): ")

    linkedin_jobs = scrape_linkedin_jobs(keyword, location)

    if linkedin_jobs:
        df = pd.DataFrame(linkedin_jobs)

        # Save to an Excel file stamped with today's date.
        today_date = datetime.date.today().strftime("%Y-%m-%d")
        filename = f"linkedin_jobs_{today_date}.xlsx"
        df.to_excel(filename, index=False)

        print(f"\nā Jobs saved to {filename}")
    else:
        print("\nā No LinkedIn jobs found.")

# %%
# Inspect the raw records returned by the scraper cell above.
print(linkedin_jobs)