Upload
This commit is contained in:
commit
f790ac8549
7 changed files with 112442 additions and 0 deletions
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
out_raw.csv
|
||||
|
||||
geckodriver
|
||||
|
||||
tmp/
|
||||
.vscode/
|
||||
|
||||
0
__init__.py
Normal file
0
__init__.py
Normal file
60
browser.py
Normal file
60
browser.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
import time
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
||||
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
class Browser:
    """Small convenience wrapper around a Selenium Firefox WebDriver.

    Every lookup helper accepts an XPath and, optionally, an already located
    ``element``.  When ``element`` is ``None`` the helper waits up to
    ``timeout`` seconds for the XPath to match in the current page; otherwise
    it operates on the given element directly.  Helpers report a wait timeout
    by returning ``None`` / ``False`` instead of raising.
    """

    # Path to the geckodriver binary, relative to the working directory.
    DRIVER_PATH = r'./geckodriver'
    # Default number of seconds to wait for an element.
    TIMEOUT = 5

    def __init__(self, headless=True):
        """Start a Firefox session.

        Args:
            headless: Run Firefox without a visible window when true.
        """
        self.uri = None
        self.headless = headless
        self.options = FirefoxOptions()
        self.options.headless = headless
        # NOTE(review): ``executable_path`` is the Selenium 3 way of pointing
        # at the driver binary; newer Selenium releases expect a Service
        # object instead -- confirm against the pinned Selenium version.
        self.driver = webdriver.Firefox(executable_path=self.DRIVER_PATH, options=self.options)

    def open_uri(self, uri, allow_refresh=False):
        """Navigate to ``uri`` unless it is already the current URL.

        Args:
            uri: Address to load.
            allow_refresh: Reload the page even if ``uri`` is already open.

        Returns:
            True if a navigation was performed, False if it was skipped.
        """
        if self.driver.current_url != uri or allow_refresh:
            self.driver.get(uri)
            # Wait for the document root so callers can query the page.
            self.find('/html', None, self.TIMEOUT)
            return True
        return False

    def find(self, xpath, element=None, timeout=TIMEOUT):
        """Locate the first element matching ``xpath``.

        Args:
            xpath: XPath expression to evaluate.
            element: Optional element to search from instead of waiting on
                the driver.
            timeout: Seconds to wait for the element (driver lookup) and the
                upper bound for retrying after stale-element errors.

        Returns:
            The matching element, or None if the wait timed out or the
            lookup kept hitting stale references until ``timeout`` elapsed.
        """
        # Bound the stale-element retry: if the caller's ``element`` is
        # itself stale, retrying can never succeed and would spin forever.
        deadline = time.monotonic() + timeout
        while True:
            try:
                if element is None:
                    wait = WebDriverWait(self.driver, timeout)
                    return wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
                # find_element(By.XPATH, ...) is available in both Selenium 3
                # and 4, unlike the removed find_element_by_xpath helper.
                return element.find_element(By.XPATH, xpath)
            except TimeoutException:
                return None
            except StaleElementReferenceException:
                if time.monotonic() >= deadline:
                    return None
                time.sleep(0.1)

    def click(self, xpath, element=None, timeout=TIMEOUT):
        """Click ``element``, or the element matching ``xpath`` once clickable.

        Returns:
            True if the click was issued, False if waiting for a clickable
            element timed out.
        """
        try:
            if element is None:
                wait = WebDriverWait(self.driver, timeout)
                wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
            else:
                element.click()
            return True
        except TimeoutException:
            return False

    def get_attribute(self, xpath, name, element=None, timeout=TIMEOUT):
        """Return attribute ``name`` of ``element`` or of the visible XPath match.

        Returns:
            The attribute value, or None if waiting for a visible element
            timed out.
        """
        try:
            if element is None:
                wait = WebDriverWait(self.driver, timeout)
                target = wait.until(EC.visibility_of_element_located((By.XPATH, xpath)))
                return target.get_attribute(name)
            return element.get_attribute(name)
        except TimeoutException:
            return None

    def get_css_value(self, xpath, name, element=None, timeout=TIMEOUT):
        """Return CSS property ``name`` of ``element`` or of the XPath match.

        Returns:
            The property value, or None if waiting for the element timed out.
        """
        try:
            if element is None:
                wait = WebDriverWait(self.driver, timeout)
                target = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
                return target.value_of_css_property(name)
            return element.value_of_css_property(name)
        except TimeoutException:
            return None

    def close(self):
        """Quit the underlying WebDriver session."""
        self.driver.quit()
|
||||
29
conf.json
Normal file
29
conf.json
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
{
|
||||
"output_file": "contacts.csv",
|
||||
"output_header": "Name,Email Address,Chat Address,Work Phone,Job Title,Department,Office Location,Company,Profile Picture",
|
||||
"screenshot": ".screenshot.png",
|
||||
"auth": {
|
||||
"username": "91816****",
|
||||
"password": "******************"
|
||||
},
|
||||
"path": {
|
||||
"login_url": "https://live.sfsu.edu",
|
||||
"contact_url": "https://outlook.office365.com/people/",
|
||||
"username_txt": "//*[@id='username']",
|
||||
"password_txt": "//*[@id='password']",
|
||||
"login_btn": "/html/body/div/div/div/form/div[3]/button",
|
||||
"stay_signed_btn": "//*[@id='idBtn_Back']",
|
||||
"contact_dir_btn": "//button[contains(., 'All User')]",
|
||||
"first_contact": "//*[@class='ReactVirtualized__Grid__innerScrollContainer']//*[@role='listitem']//*",
|
||||
"name": "//*[@data-log-name='PersonName']",
|
||||
"company": "//*[contains(@title, 'Company')]//h4/following-sibling::*",
|
||||
"job_title": "//*[@data-log-name='JobTitle']",
|
||||
"department": "//*[@data-log-name='Department']",
|
||||
"email": "//*[contains(@title, 'Email')]//h4/following-sibling::*",
|
||||
"chat": "//*[contains(@title, 'Chat')]//h4/following-sibling::*",
|
||||
"mobile": "//*[contains(@title, 'Mobile')]//h4/following-sibling::*",
|
||||
"work_phone": "//*[contains(@title, 'Work phone')]//h4/following-sibling::*",
|
||||
"office_location": "//*[contains(@title, 'Office location')]//h4/following-sibling::*",
|
||||
"profile_picture": "//img[contains(@alt, 'Profile picture')]"
|
||||
}
|
||||
}
|
||||
10
conf.py
Normal file
10
conf.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
import json
|
||||
|
||||
class Conf:
    """Configuration loaded from a JSON file.

    The parsed document is exposed unchanged as ``self.data``.
    """

    def __init__(self, filename):
        """Parse ``filename`` as JSON and store the result in ``self.data``.

        Args:
            filename: Path of the JSON configuration file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(filename, encoding='utf-8') as f:
            self.data = json.load(f)

    def __str__(self):
        """Return the configuration as indented JSON with sorted keys."""
        # __str__ must return the string; printing it and returning None
        # makes str(conf) / print(conf) raise TypeError.
        return json.dumps(self.data, indent=2, sort_keys=True)
|
||||
94
contacts.py
Normal file
94
contacts.py
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
import base64
|
||||
import csv
|
||||
import datetime
|
||||
import time
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
from conf import Conf
|
||||
from browser import Browser
|
||||
|
||||
# Load configuration into variables
conf = Conf('./conf.json').data
auth = conf['auth']
xpath = conf['path']

# Authenticate
browser = Browser(headless=False)
browser.open_uri(xpath['login_url'])

# Enter username and password
username = browser.find(xpath['username_txt'])
username.clear()
username.send_keys(auth['username'])
password = browser.find(xpath['password_txt'])
password.clear()
password.send_keys(auth['password'])
# Sign in
browser.click(xpath['login_btn'])
browser.click(xpath['stay_signed_btn'])
# Open contacts
browser.open_uri(xpath['contact_url'])
browser.click(xpath['contact_dir_btn'])
# Set focus on the first item in contacts list
first_contact = browser.find(xpath['first_contact'])
first_contact.click()

# TODO: To automatically locate the first contact on a headless browser, use driver.execute_script() with scrollBy
screenshot_filename = input("Enter filname of screenshot: ")
output_filename = input("Enter filename of output csv file: ")
input("Click on the first contact and press Enter to continue...")

# Keys into the XPath configuration for the text fields of one contact,
# in the order they are written to each CSV row.
# NOTE(review): this row has ten fields (it includes 'mobile'), while the
# 'output_header' in conf.json lists nine columns and has no mobile column,
# so header and data are misaligned -- confirm which one is intended.
CONTACT_FIELDS = (
    'name',
    'email',
    'chat',
    'mobile',
    'work_phone',
    'job_title',
    'department',
    'office_location',
    'company',
)


def _text_of(element):
    """Return the element's text, or '' when the element was not found."""
    return '' if element is None else element.text


with open(output_filename, 'a+', newline='') as f:
    writer = csv.writer(f)
    # If output file is empty, create header.  A file opened in append mode
    # is positioned at its end, so iterating it yields nothing even when it
    # already has content; check the end offset instead.
    f.seek(0, 2)
    if f.tell() == 0:
        f.write(conf['output_header'] + '\r\n')

    # NOTE(review): the loop has no exit condition; it runs until the script
    # is interrupted or an exception is raised.
    while True:
        time.sleep(browser.TIMEOUT)

        # Current contact has :focus
        curr_contact = browser.driver.switch_to.active_element

        # Scrape current contact
        row = [_text_of(browser.find(xpath[key])) for key in CONTACT_FIELDS]

        picture = browser.find(xpath['profile_picture'])
        picture_b64 = ''
        if picture is not None:
            # Save profile picture to local file and convert to base64
            picture.screenshot(screenshot_filename)
            with open(screenshot_filename, 'rb') as p:
                picture_b64 = base64.b64encode(p.read()).decode('utf-8')
        row.append(picture_b64)

        # Write current contact to output file
        writer.writerow(row)
        f.flush()
        # Print current contact (name and email; both are '' when missing)
        print(datetime.datetime.now(), row[0], row[1])

        # Set :focus on the next contact
        curr_contact.send_keys(Keys.DOWN)
|
||||
112242
out.csv
Normal file
112242
out.csv
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue