Module indeed_scraping
import datetime
import json
import time
from random import randrange

import pandas as pd
import psycopg2
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager
def scrap_offer_indeed(list_keyword, offer_age, indeed_country, headless=True):
    """Scrape indeed offers using a Chrome browser.

    Args:
        list_keyword: list of search terms to look up on indeed.
        offer_age: maximum age of an offer, in days.
        indeed_country: two-letter country code used to build the url.
        headless: whether to hide the chrome window, default True.

    Returns:
        soup_list: list of parsed result pages to be fed to parse_indeed_soup.

    Example:
        Scrape offers from the last 3 days for web developer and data analyst
        in the United Kingdom:
        scrap_offer_indeed(["web developer", "data analyst"], 3, "uk", False)
    """
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("window-size=1024,768")
    chrome_options.add_argument("--no-sandbox")
    if headless:
        chrome_options.add_argument("--headless")
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36')
    chrome_options.add_argument(f'user-agent={user_agent}')
    s = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=s, options=chrome_options)
    cookies_clicked_already = False
    soup_list = []
    for keyword in list_keyword:
        url_base = f"https://{indeed_country}.indeed.com"
        url_param = f"/jobs?q={keyword}&fromage={offer_age}"
        while url_param:
            url = url_base + url_param
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            if not cookies_clicked_already:
                # Dismiss the cookie banner once per session (English or
                # French wording).
                button = None
                for button in soup.find_all("button"):
                    if "All Cookies" in button.text or "les cookies" in button.text:
                        break
                if button is not None and button.attrs.get("id"):
                    button_id = button.attrs["id"]
                    WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, f'button[id="{button_id}"]'))
                    ).click()
                    time.sleep(3 + randrange(5, 11) / 10)
                cookies_clicked_already = True
                # Re-read the page now that the banner is gone.
                soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Follow the pagination link until there is no next page.
            nav = soup.find("nav", {"role": "navigation"})
            next_page = nav.find("a", {"aria-label": "Next Page"}) if nav else None
            url_param = next_page["href"] if next_page else None
            soup_list.append(soup)
    driver.close()
    return soup_list
def parse_indeed_soup(soup_list, indeed_country):
    """Parse the indeed offers into readable pandas DataFrames.

    Args:
        soup_list: list of parsed result pages (BeautifulSoup objects).
        indeed_country: two-letter country code used to build offer urls.

    Returns:
        df_jobs: pandas DataFrame with one row per indeed offer and its overall information.
        df_tags: pandas DataFrame with job tags such as contract type, salary and more.
    """
    id_list = []
    job_title_list = []
    company_name_list = []
    job_location_list = []
    description_snippet_list = []
    rating_list = []
    scraped_at_list = []
    url_list = []
    list_dict_tag = []
    for s in soup_list:
        soup_beacon = s.find_all("div", {"class": "job_seen_beacon"})
        for soup in soup_beacon:
            id_ = soup.a["data-jk"]
            job_title = soup.find("span", {"title": True})
            company_name = soup.find("span", {"class": "companyName"})
            job_location = soup.find("div", {"class": "companyLocation"})
            rating = soup.find("span", {"class": "ratingNumber"})
            scraped_at = datetime.datetime.now().replace(microsecond=0)
            url = f'https://{indeed_country}.indeed.com/viewjob?jk={id_}'
            id_list.append(id_)
            url_list.append(url)
            job_title_list.append(job_title.text if job_title else None)
            company_name_list.append(company_name.text if company_name else None)
            job_location_list.append(job_location.text if job_location else None)
            rating_list.append(rating.text if rating else None)
            scraped_at_list.append(scraped_at)
            # Job tags (contract type, salary and more).
            for ele in soup.find_all("div", {"class": "attribute_snippet"}):
                list_dict_tag.append(
                    {
                        "id_job": id_,
                        "job_tag": ele.text.replace('\xa0', ' ')
                    }
                )
            description_snippet = soup.find("div", {"class": "job-snippet"})
            if description_snippet:
                description_snippet_list.append(description_snippet.text.strip().replace('\xa0', ' '))
            else:
                description_snippet_list.append(None)
    dict_jobs = {
        "id": id_list,
        "job_title": job_title_list,
        "company_name": company_name_list,
        "job_location": job_location_list,
        "description_snippet": description_snippet_list,
        "company_rating": rating_list,
        "scraped_at": scraped_at_list,
        "url": url_list
    }
    df_jobs = pd.DataFrame(dict_jobs)
    df_jobs = df_jobs.drop_duplicates(subset='id')
    # Some locales use a comma as the decimal separator in ratings.
    df_jobs["company_rating"] = df_jobs["company_rating"].str.replace(",", ".").astype("float")
    df_tags = pd.DataFrame(list_dict_tag)
    df_tags = df_tags.drop_duplicates(subset=['id_job', 'job_tag'])
    return df_jobs, df_tags
def insert_and_update_table(df_jobs, df_tags):
    """Insert new rows and update existing rows in the jobs and tags tables, given a .json connection file.

    Args:
        df_jobs: pandas DataFrame with one row per indeed offer and its overall information.
        df_tags: pandas DataFrame with job tags such as contract type, salary and more.
    """
    with open("postgresql_credentials.json") as f:
        creds = json.load(f)
    conn_string = f'postgresql://{creds["username"]}:{creds["password"]}@{creds["host"]}/{creds["database"]}'
    engine = create_engine(conn_string)
    # Stage the scraped rows in temporary tables, then merge them in SQL.
    with engine.begin() as conn_alchemy:
        df_jobs.to_sql('jobs_to_add', conn_alchemy, if_exists='replace')
        df_tags.to_sql('tags_to_add', conn_alchemy, if_exists='replace')
    sql_insert_and_update_jobs = """
    INSERT INTO jobs (id, job_title, company_name, description_snippet, company_rating, scraped_at, url)
    SELECT
        new.id,
        new.job_title,
        new.company_name,
        new.description_snippet,
        new.company_rating,
        new.scraped_at,
        new.url
    FROM jobs_to_add AS new
    LEFT JOIN jobs ON jobs.id = new.id
    WHERE jobs.id IS NULL;

    WITH jobs_to_update AS (
        SELECT
            new.id,
            new.job_title,
            new.company_name,
            new.description_snippet,
            new.company_rating,
            new.scraped_at
        FROM jobs_to_add AS new
        INNER JOIN jobs ON jobs.id = new.id
    )
    UPDATE jobs
    SET
        job_title = jobs_to_update.job_title,
        company_name = jobs_to_update.company_name,
        description_snippet = jobs_to_update.description_snippet,
        company_rating = jobs_to_update.company_rating
    FROM jobs_to_update
    WHERE jobs.id = jobs_to_update.id;

    DROP TABLE jobs_to_add;
    """
    sql_insert_tags = """
    INSERT INTO tags (id_job, job_tag)
    SELECT
        new.id_job,
        new.job_tag
    FROM tags_to_add AS new
    LEFT JOIN tags ON tags.id_job = new.id_job
    WHERE tags.id_job IS NULL;

    DROP TABLE tags_to_add;
    """
    conn = psycopg2.connect(
        dbname=creds["database"],
        user=creds["username"],
        password=creds["password"],
        host=creds["host"]
    )
    with conn:
        with conn.cursor() as cur:
            cur.execute(sql_insert_and_update_jobs)
            cur.execute(sql_insert_tags)
    conn.close()
def get_id_and_url_indeed_description_to_scrap(n_offers):
    """Get the id and url of the latest indeed offers that still lack a full description.

    Args:
        n_offers: number of offers to fetch.

    Returns:
        id_list: list of ids of the indeed offers.
        url_list: list of urls of the indeed offers.

    Example:
        Get the 10 latest ids and urls of the indeed offers:
        get_id_and_url_indeed_description_to_scrap(10)
    """
    with open("postgresql_credentials.json") as f:
        creds = json.load(f)
    conn_string = f'postgresql://{creds["username"]}:{creds["password"]}@{creds["host"]}/{creds["database"]}'
    engine = create_engine(conn_string)
    sql_query = f"""
    SELECT id, url FROM jobs
    WHERE description_verbose IS NULL
    ORDER BY updated_at DESC
    LIMIT {int(n_offers)}
    """
    with engine.begin() as conn_alchemy:
        df = pd.read_sql(sql_query, conn_alchemy)
    id_list = df["id"].to_list()
    url_list = df["url"].to_list()
    return id_list, url_list
def scrap_indeed_description(id_offers_to_scrap, url_offers_to_scrap, headless=True):
    """Scrape full indeed descriptions using a Chrome browser.

    Args:
        id_offers_to_scrap: list of offer ids to scrape.
        url_offers_to_scrap: list of matching offer urls.
        headless: whether to hide the chrome window, default True.

    Returns:
        df_description: pandas DataFrame with the id, url and verbose description of each offer.

    Example:
        Scrape the full description of these two offer ids (note the fr country in the urls):
        scrap_indeed_description(
            ['24beaf6498294e77', '60c4fa9e88ed2de9'],
            ['https://fr.indeed.com/viewjob?jk=24beaf6498294e77', 'https://fr.indeed.com/viewjob?jk=60c4fa9e88ed2de9'],
            False)
    """
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("window-size=1024,768")
    chrome_options.add_argument("--no-sandbox")
    if headless:
        chrome_options.add_argument("--headless")
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36')
    chrome_options.add_argument(f'user-agent={user_agent}')
    s = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=s, options=chrome_options)
    description_verbose_list = []
    cookies_clicked_already = False
    for url in url_offers_to_scrap:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if not cookies_clicked_already:
            # Dismiss the cookie banner once per session (English or French wording).
            button = None
            for button in soup.find_all("button"):
                if "All Cookies" in button.text or "les cookies" in button.text:
                    break
            if button is not None and button.attrs.get("id"):
                button_id = button.attrs["id"]
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, f'button[id="{button_id}"]'))
                ).click()
                time.sleep(3 + randrange(5, 11) / 10)
            cookies_clicked_already = True
        # Append exactly one entry per url so the lists stay aligned with
        # id_offers_to_scrap when building the DataFrame below.
        description = soup.find("div", {"id": "jobDescriptionText"})
        if description:
            description_verbose_list.append(description.text.replace('\xa0', ' '))
        else:
            description_verbose_list.append(None)
        # Small random pause between pages.
        time.sleep(3 + randrange(5, 11) / 10)
    driver.close()
    dict_description = {
        "id": id_offers_to_scrap,
        "url": url_offers_to_scrap,
        "description_verbose": description_verbose_list
    }
    df_description = pd.DataFrame(dict_description)
    return df_description
def update_indeed_description_verbose(df_description):
    """Update the description_verbose column of the jobs table, given a .json connection file.

    Args:
        df_description: pandas DataFrame with the id, url and verbose description of each offer.
    """
    with open("postgresql_credentials.json") as f:
        creds = json.load(f)
    conn_string = f'postgresql://{creds["username"]}:{creds["password"]}@{creds["host"]}/{creds["database"]}'
    engine = create_engine(conn_string)
    with engine.begin() as conn_alchemy:
        df_description.to_sql('description_verbose_to_update', conn_alchemy, if_exists='replace')
    conn = psycopg2.connect(
        dbname=creds["database"],
        user=creds["username"],
        password=creds["password"],
        host=creds["host"]
    )
    with conn:
        with conn.cursor() as cur:
            sql_update_description_verbose = """
            UPDATE jobs
            SET description_verbose = d.description_verbose
            FROM description_verbose_to_update AS d
            WHERE jobs.id = d.id;

            DROP TABLE description_verbose_to_update;
            """
            cur.execute(sql_update_description_verbose)
    conn.close()
if __name__ == '__main__':
    offer_age = 1
    list_country = ['fr', 'ca', 'uk', 'au']
    # for indeed_country in list_country:
    #     if indeed_country == 'fr':
    #         list_keyword = ['data', 'développeur web', 'designer UX/UI']
    #     else:
    #         list_keyword = ['data', 'web developer', 'UX/UI designer']
    #     soup_list = scrap_offer_indeed(list_keyword, offer_age, indeed_country, False)
    #     df_jobs, df_tags = parse_indeed_soup(soup_list, indeed_country)
    #     insert_and_update_table(df_jobs, df_tags)
    # for i in range(130):
    #     id_offers_to_scrap, url_offers_to_scrap = get_id_and_url_indeed_description_to_scrap(15)
    #     if id_offers_to_scrap:
    #         df_description = scrap_indeed_description(id_offers_to_scrap, url_offers_to_scrap, False)
    #         update_indeed_description_verbose(df_description)
    #     else:
    #         break
Functions
def get_id_and_url_indeed_description_to_scrap(n_offers)
Get the id and url of the latest indeed offers that still lack a full description.
Args
n_offers: number of offers to fetch.
Returns
id_list: list of ids of the indeed offers.
url_list: list of urls of the indeed offers.
Example
Get the 10 latest ids and urls of the indeed offers:
get_id_and_url_indeed_description_to_scrap(10)
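A minimal usage sketch, assuming a populated jobs table and a postgresql_credentials.json file in the working directory (both requirements come from the module above):

# Fetch up to 10 offers still missing a full description and peek at the first url.
id_list, url_list = get_id_and_url_indeed_description_to_scrap(10)
if id_list:
    print(f"{len(id_list)} offers to scrape, first url: {url_list[0]}")
else:
    print("No offers left to scrape.")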
def insert_and_update_table(df_jobs, df_tags)
Insert new rows and update existing rows in the jobs and tags tables, given a .json connection file.
Args
df_jobs: pandas DataFrame with one row per indeed offer and its overall information.
df_tags: pandas DataFrame with job tags such as contract type, salary and more.
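The function never creates the jobs and tags tables itself, and get_id_and_url_indeed_description_to_scrap additionally reads an updated_at column that nothing in this module writes. A one-time setup sketch with assumed column types follows; the updated_at default is an assumption, and the real database may maintain that column differently (for example with a trigger):

import json
import psycopg2

# Assumed shape of postgresql_credentials.json:
# {"username": "...", "password": "...", "host": "...", "database": "..."}
with open("postgresql_credentials.json") as f:
    creds = json.load(f)

# Hypothetical DDL covering only the columns this module reads or writes.
ddl = """
CREATE TABLE IF NOT EXISTS jobs (
    id TEXT PRIMARY KEY,
    job_title TEXT,
    company_name TEXT,
    description_snippet TEXT,
    description_verbose TEXT,
    company_rating REAL,
    scraped_at TIMESTAMP,
    updated_at TIMESTAMP DEFAULT now(),
    url TEXT
);
CREATE TABLE IF NOT EXISTS tags (
    id_job TEXT,
    job_tag TEXT
);
"""
conn = psycopg2.connect(dbname=creds["database"], user=creds["username"],
                        password=creds["password"], host=creds["host"])
with conn:
    with conn.cursor() as cur:
        cur.execute(ddl)
conn.close()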
def parse_indeed_soup(soup_list, indeed_country)
Parse the indeed offers into readable pandas DataFrames.
Args
soup_list: list of parsed result pages (BeautifulSoup objects).
indeed_country: two-letter country code used to build offer urls.
Returns
df_jobs: pandas DataFrame with one row per indeed offer and its overall information.
df_tags: pandas DataFrame with job tags such as contract type, salary and more.
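A short sketch of the scrape-then-parse hand-off, assuming network access and a local Chrome install (keyword and country are illustrative):

# One day of "data analyst" offers from uk.indeed.com, parsed into frames.
soup_list = scrap_offer_indeed(["data analyst"], 1, "uk")
df_jobs, df_tags = parse_indeed_soup(soup_list, "uk")
print(df_jobs[["id", "job_title", "company_name", "company_rating"]].head())
print(df_tags.head())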
def scrap_indeed_description(id_offers_to_scrap, url_offers_to_scrap, headless=True)
Scrape full indeed descriptions using a Chrome browser.
Args
id_offers_to_scrap: list of offer ids to scrape.
url_offers_to_scrap: list of matching offer urls.
headless: whether to hide the chrome window, default True.
Returns
df_description: pandas DataFrame with the id, url and verbose description of each offer.
Example
Scrape the full description of these two offer ids (note the fr country in the urls):
scrap_indeed_description(
    ['24beaf6498294e77', '60c4fa9e88ed2de9'],
    ['https://fr.indeed.com/viewjob?jk=24beaf6498294e77', 'https://fr.indeed.com/viewjob?jk=60c4fa9e88ed2de9'],
    False)
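The returned frame is shaped to feed straight into update_indeed_description_verbose; a minimal hand-off sketch, assuming the database already holds the offers:

ids, urls = get_id_and_url_indeed_description_to_scrap(15)
if ids:
    df_description = scrap_indeed_description(ids, urls)
    update_indeed_description_verbose(df_description)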
def scrap_offer_indeed(list_keyword, offer_age, indeed_country, headless=True)
Scrape indeed offers using a Chrome browser.
Args
list_keyword: list of search terms to look up on indeed.
offer_age: maximum age of an offer, in days.
indeed_country: two-letter country code used to build the url.
headless: whether to hide the chrome window, default True.
Returns
soup_list: list of parsed result pages to be fed to parse_indeed_soup.
Example
Scrape offers from the last 3 days for web developer and data analyst in the United Kingdom:
scrap_offer_indeed(["web developer", "data analyst"], 3, "uk", False)
def update_indeed_description_verbose(df_description)
Update the description_verbose column of the jobs table, given a .json connection file.
Args
df_description: pandas DataFrame with the id, url and verbose description of each offer.
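A batch refresh loop in the style of the commented-out __main__ block: keep pulling small batches of offers without a full description until none remain (the batch size of 15 and the cap of 130 iterations are illustrative):

# Scrape and store full descriptions, 15 offers at a time.
for _ in range(130):
    ids, urls = get_id_and_url_indeed_description_to_scrap(15)
    if not ids:
        break
    df_description = scrap_indeed_description(ids, urls)
    update_indeed_description_verbose(df_description)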