Python

Here is an example of some Python code I wrote to open a webpage, scrape it for text and links, go through 600 pages of results, and then output the data to the Google Firebase console.

from bs4 import BeautifulSoup

from selenium import webdrivers

from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions

from selenium.webdriver.support.expected_conditions import presence_of_element_located

from selenium.webdriver.support.expected_conditions import element_to_be_clickable

from selenium.webdriver.support.expected_conditions import url_changes

from selenium.webdriver.common.action_chains import *

from decimal import Decimal

from Product import Product

from selenium.common.exceptions import TimeoutException

import re

import time

from selenium.common.exceptions import ElementNotVisibleException

from selenium.common.exceptions import StaleElementReferenceException

from selenium.common.exceptions import *

from fractions import Fraction

import sys

import codecs

def scrape_FestivalFoods_search_results(list_of_products, product):

# Code taken from here: https://stackoverflow.com/questions/59787776/how-to-set-chrome-experimental-option-same-site-by-default-cookie-in-python-sele

# Code taken from here: https://www.selenium.dev/documentation/en/

# for the special accented characters in the product names

if sys.stdout.encoding != 'UTF-8':

sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')

if sys.stderr.encoding != 'UTF-8':

sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')

chrome_options = webdriver.ChromeOptions()

experimentalFlags = ['same-site-by-default-cookies@1','cookies-without-same-site-must-be-secure@1']

chromeLocalStatePrefs = { 'browser.enabled_labs_experiments' : experimentalFlags}

chrome_options.add_experimental_option('localState',chromeLocalStatePrefs)

driver = webdriver.Chrome(options=chrome_options, executable_path=r'chromedriver.exe')

driver.get("https://www.festfoods.com/shop#!/?q={}".format(product))

#driver.get("https://www.festfoods.com/my-store/store-locator")

price_line = re.compile('\$[0-9]+\.[0-9][0-9]$')

size_letters_and_numbers = re.compile('\S*(\S*([a-zA-Z]\S*[0-9])|([0-9]\S*[a-zA-Z]))\S*')

trailing_periods = re.compile('\.(?!\d)')

containing_letters = re.compile('\S*([a-zA-Z])')

wait = WebDriverWait(driver, 30)

#print("I have just declared the wait and am now about to wait until the first div on the page is present")

element_present = expected_conditions.presence_of_all_elements_located((By.XPATH, "/html/body/div[1]"))

try:

wait.until(element_present);

#print("found the page contents")

except TimeoutException:

print("Could not scan FestivalFoods for:", end = " ")

print("the first div on the page")

print(product)

print("\n")

return

driver.close()

wait.until(expected_conditions.presence_of_all_elements_located((By.XPATH, "/html/body/div[1]/div/header/div/nav/section[1]/div/div/div[5]/div/span[3]/a[1]")));

sign_in = driver.find_element_by_xpath("/html/body/div[1]/div/header/div/nav/section[1]/div/div/div[5]/div/span[3]/a[1]")

driver.execute_script("arguments[0].click();", sign_in) #click sign in

wait.until(expected_conditions.presence_of_all_elements_located((By.XPATH, "/html/body/div[1]/div/main/div/div[2]/article/ul/li/div/div/div[3]/div/div/form/div[1]/div[2]/label/input")));

username = driver.find_element_by_xpath("/html/body/div[1]/div/main/div/div[2]/article/ul/li/div/div/div[3]/div/div/form/div[1]/div[2]/label/input")

username.send_keys('xyoshqx@hotmail.com')

password = driver.find_element_by_xpath("/html/body/div[1]/div/main/div/div[2]/article/ul/li/div/div/div[3]/div/div/form/div[1]/div[3]/label[1]/input")

password.send_keys('festivalfoods')

sign_in2 = driver.find_element_by_xpath("/html/body/div[1]/div/main/div/div[2]/article/ul/li/div/div/div[3]/div/div/form/div[2]/button")

driver.execute_script("arguments[0].click();", sign_in2)

try:

# lxml is a parser. soup now will contain all of the html in the page.

soup = BeautifulSoup(driver.page_source, 'lxml')

store_cards = soup.find_all('div', class_='all-stores-info col-md-4 col-sm-12')

for store in store_cards:

if store.find('h3').getText() == "Madison":

make_this_my_store_button = driver.find_elements_by_xpath("//button[@class='btn btn-small btn-make-store']")

driver.execute_script("arguments[0].click();", make_this_my_store_button[18])

#print("store has been selected")

except TimeoutException:

print("store selection failed")

return

driver.close()

# click Shop

try:

shop = driver.find_element_by_xpath("/html/body/div[1]/div/header/div/nav/section[2]/section[1]/div/div/div[4]/section/a/div[2]")

driver.execute_script("arguments[0].click();", shop)

per_page_string = "/html/body/div[1]/div/main/section/div[6]/div/div[1]/div/div/div[3]/label[1]/span[2]/button/span[1]"

wait.until(expected_conditions.presence_of_all_elements_located((By.XPATH, per_page_string)));

per_page = driver.find_element_by_xpath(per_page_string)

driver.execute_script("arguments[0].click();", per_page)

forty_eight = driver.find_element_by_xpath("/html/body/div[1]/div/main/section/div[6]/div/div[1]/div/div/div[3]/label[1]/span[2]/span/span/span[5]/a")

driver.execute_script("arguments[0].click();", forty_eight)

#print("shop has been clicked")

except TimeoutException:

print("shop could not be clicked")

return

driver.close()

element_present = expected_conditions.presence_of_all_elements_located((By.XPATH, "/html/body/div[1]/div/main/section/div[6]/div/div[3]/div[2]/div[2]/ul/li[1]/div/div[2]/div[2]/div[1]/a"))

try:

wait.until(element_present);

#print("found product list")

except TimeoutException:

print("Could not scan FestivalFoods for:", end = " ")

print("the product results")

print(product)

print("\n")

return

driver.close()

pageCount = 1

#time.sleep(10)

continue_while_loop = True

while continue_while_loop:

# go to the next page

try:

#print("I've either reached the end of the products or I had some trouble finding the page link.")

nextPageLink = driver.find_element_by_link_text(str(pageCount))

signUpForOurMobileClubButton = driver.find_element_by_xpath("/html/body/div[1]/div/footer/div[1]/div/div[2]/ul/li[1]/button/span")

ActionChains(driver).move_to_element(signUpForOurMobileClubButton).click(nextPageLink).perform()

if pageCount == 100: