/ scraping.ipynb
scraping.ipynb
  1  {
  2   "cells": [
  3    {
  4     "cell_type": "code",
  5     "execution_count": 1,
  6     "metadata": {},
  7     "outputs": [],
  8     "source": [
  9      "# One-time setup: uncomment to install the dependencies into the running kernel's environment.\n",
 10      "# %pip install undetected-chromedriver selenium pandas\n",
 11      "# %pip install openpyxl\n",
 12      "# %pip install beautifulsoup4"
 13     ]
 14    },
 15    {
 16     "cell_type": "code",
 17     "execution_count": 2,
 18     "metadata": {},
 19     "outputs": [],
 20     "source": [
 21      "import time\n",
 22      "import datetime\n",
 23      "import pandas as pd\n",
 24      "import undetected_chromedriver as uc\n",
 25      "from selenium import webdriver\n",
 26      "from selenium.webdriver.common.by import By\n",
 27      "from selenium.webdriver.support.ui import WebDriverWait\n",
 28      "from selenium.webdriver.support import expected_conditions as EC"
 29     ]
 30    },
 31    {
 32     "cell_type": "code",
 33     "execution_count": null,
 34     "metadata": {},
 35     "outputs": [
 36      {
 37       "name": "stdout",
 38       "output_type": "stream",
 39       "text": [
 40        "\n",
 41        "🔍 Scraping Careers Page...\n",
 42        "\n",
 43        "\n",
 44        "❌ No jobs found.\n"
 45       ]
 46      }
 47     ],
 48     "source": [
 49      "\n",
 50      "\n",
 51      "# ---- Function to Scrape LinkedIn Jobs Using Selenium ----\n",
 52      "def scrape_linkedin_jobs(keyword, location, max_jobs=10):\n",
 53      "    \"\"\"Scrape public LinkedIn job cards for a keyword/location search.\n",
 54      "\n",
 55      "    Args:\n",
 56      "        keyword: Job title or search phrase.\n",
 57      "        location: Free-text location filter.\n",
 58      "        max_jobs: Maximum number of job cards to read (default 10).\n",
 59      "\n",
 60      "    Returns:\n",
 61      "        List of dicts with keys title, company, link, source; empty if no card loads in time.\n",
 62      "    \"\"\"\n",
 63      "    from urllib.parse import quote, urlencode\n",
 64      "    from selenium.common.exceptions import TimeoutException\n",
 65      "\n",
 66      "    def text_of(element):\n",
 67      "        # .text is empty for elements Selenium considers not displayed; fall back to the DOM text.\n",
 68      "        return element.text.strip() or (element.get_attribute(\"textContent\") or \"\").strip()\n",
 69      "\n",
 70      "    print(\"\\n🔍 Scraping LinkedIn Jobs...\\n\")\n",
 71      "    options = uc.ChromeOptions()  # undetected_chromedriver ships its own options class\n",
 72      "    for flag in (\"--headless\", \"--no-sandbox\", \"--disable-dev-shm-usage\"):\n",
 73      "        options.add_argument(flag)\n",
 74      "    # Percent-encode the inputs so '&', '#', '+' or non-ASCII text cannot corrupt the query string.\n",
 75      "    query = urlencode({\"keywords\": keyword, \"location\": location}, quote_via=quote)\n",
 76      "    jobs = []\n",
 77      "    driver = uc.Chrome(options=options)\n",
 78      "    try:  # the finally clause closes the browser even when scraping raises\n",
 79      "        driver.get(f\"https://www.linkedin.com/jobs/search?{query}\")\n",
 80      "        try:\n",
 81      "            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, \"base-card\")))\n",
 82      "        except TimeoutException:\n",
 83      "            return jobs  # no job card appeared within 15 s\n",
 84      "        for _ in range(3):  # scroll so the page lazy-loads more cards\n",
 85      "            driver.execute_script(\"window.scrollBy(0, 800);\")\n",
 86      "            time.sleep(2)\n",
 87      "        for card in driver.find_elements(By.CLASS_NAME, \"base-card\")[:max_jobs]:\n",
 88      "            try:\n",
 89      "                title = text_of(card.find_element(By.CSS_SELECTOR, \"h3\"))\n",
 90      "                company = text_of(card.find_element(By.CSS_SELECTOR, \"h4\"))\n",
 91      "                link = card.find_element(By.TAG_NAME, \"a\").get_attribute(\"href\")\n",
 92      "                jobs.append({\"title\": title, \"company\": company, \"link\": link, \"source\": \"LinkedIn\"})\n",
 93      "            except Exception as e:\n",
 94      "                print(f\"⚠️ Skipping a job entry due to error: {e}\")\n",
 95      "    finally:\n",
 96      "        driver.quit()\n",
 97      "    return jobs\n",
 98      "\n",
 99      "# ---- Run the Script and Save to Excel ----\n",
100      "if __name__ == \"__main__\":\n",
101      "    # Ask for the search terms; strip stray whitespace so it never reaches the query.\n",
102      "    keyword = input(\"Enter job title (e.g., Software Engineer): \").strip()\n",
103      "    location = input(\"Enter location (e.g., Remote, New York, Berlin): \").strip()\n",
104      "\n",
105      "    linkedin_jobs = scrape_linkedin_jobs(keyword, location)\n",
106      "\n",
107      "    if linkedin_jobs:\n",
108      "        df = pd.DataFrame(linkedin_jobs)\n",
109      "        # Write a date-stamped workbook; pandas needs openpyxl installed to produce .xlsx.\n",
110      "        today_date = datetime.date.today().strftime(\"%Y-%m-%d\")\n",
111      "        filename = f\"linkedin_jobs_{today_date}.xlsx\"\n",
112      "        df.to_excel(filename, index=False)\n",
113      "\n",
114      "        print(f\"\\n✅ Jobs saved to {filename}\")\n",
115      "    else:\n",
116      "        print(\"\\n❌ No LinkedIn jobs found.\")\n"
117     ]
118    },
119    {
120     "cell_type": "code",
121     "execution_count": null,
122     "metadata": {},
123     "outputs": [
124      {
125       "name": "stdout",
126       "output_type": "stream",
127       "text": [
128        "[{'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/marketing-internship-at-amazon-4111457309?position=1&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=sEZAo0lK6xh1n0%2BuUSWCmA%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/internship-pr-social-media-at-msm-digital-4121988576?position=2&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=EyLr1MNgmwM7eTzife4mRg%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/internship-marketing-at-pulse-advertising-4147607855?position=3&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=2XMBVlHfaCif61ir1QQXzw%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/internship-new-business-development-m-f-x-at-tietalent-4143635138?position=4&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=FPz6HjsyhYfcCZ6P1pzrsg%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/corporate-communications-and-events-intern-m-f-d-at-bat-4159358648?position=5&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=IdYmKG048vZobOumaZEyCg%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/communications-intern-northvolt-germany-at-northvolt-4121020101?position=6&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=dwLc50Bn25QiAH6PFbNWVQ%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/program-intern-northvolt-germany-at-northvolt-4121017542?position=7&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=a%2B4sry3aOnQDRfHKWy8lbQ%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 
'https://de.linkedin.com/jobs/view/internship-finance-accounting-f-m-d-at-mutabor-4125908278?position=8&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=zFA7%2FT0SRn4ZgH3fuQvmdg%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/communications-intern-northvolt-germany-at-northvolt-poland-4092459625?position=9&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=W%2B7a6CHDCL1eZJjx0dooNw%3D%3D', 'source': 'LinkedIn'}, {'title': '', 'company': '', 'link': 'https://de.linkedin.com/jobs/view/program-intern-northvolt-germany-at-northvolt-poland-4090630146?position=10&pageNum=0&refId=3rcmXj%2F5XTlGX9JqFduCTw%3D%3D&trackingId=Z9%2BvQdDbSyWM8qzzWxsD5Q%3D%3D', 'source': 'LinkedIn'}]\n"
129       ]
130      }
131     ],
132     "source": [
133      "pd.DataFrame(linkedin_jobs)  # last expression renders as a rich table instead of a raw list dump"
134     ]
135    },
136    {
137     "cell_type": "code",
138     "execution_count": null,
139     "metadata": {},
140     "outputs": [],
141     "source": []
142    }
143   ],
144   "metadata": {
145    "kernelspec": {
146     "display_name": "CV_R",
147     "language": "python",
148     "name": "python3"
149    },
150    "language_info": {
151     "codemirror_mode": {
152      "name": "ipython",
153      "version": 3
154     },
155     "file_extension": ".py",
156     "mimetype": "text/x-python",
157     "name": "python",
158     "nbconvert_exporter": "python",
159     "pygments_lexer": "ipython3",
160     "version": "3.11.11"
161    }
162   },
163   "nbformat": 4,
164   "nbformat_minor": 2
165  }