This commit is contained in:
Jay 2020-11-04 01:13:41 +09:00
commit f790ac8549
7 changed files with 112442 additions and 0 deletions

7
.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
out_raw.csv
geckodriver
tmp/
.vscode/

0
__init__.py Normal file
View file

60
browser.py Normal file
View file

@ -0,0 +1,60 @@
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class Browser:
    """Thin convenience wrapper around a Selenium Firefox WebDriver.

    Every lookup helper accepts an XPath expression.  When ``element`` is
    omitted the helper waits on the whole page for up to ``timeout`` seconds;
    when ``element`` is supplied the helper acts on that element directly
    and does not wait.
    """

    # Path to the geckodriver executable handed to Selenium.
    DRIVER_PATH = r'./geckodriver'
    # Default number of seconds an explicit wait polls before giving up.
    TIMEOUT = 5
    # Upper bound on retries in find() after a StaleElementReferenceException.
    MAX_STALE_RETRIES = 3

    def __init__(self, headless=True):
        """Start a Firefox session.

        Args:
            headless: Run Firefox without a visible window when true.
        """
        # NOTE(review): ``uri`` is initialised here but not updated by any
        # method of this class.
        self.uri = None
        self.headless = headless
        self.options = FirefoxOptions()
        self.options.headless = headless
        self.driver = webdriver.Firefox(executable_path=self.DRIVER_PATH, options=self.options)

    def close(self):
        """Quit the underlying WebDriver session."""
        self.driver.quit()

    def open_uri(self, uri, allow_refresh=False):
        """Navigate to ``uri`` unless the browser is already there.

        Args:
            uri: Address to load.
            allow_refresh: Load the page even if it is the current URL.

        Returns:
            True if a navigation was issued, False if it was skipped.
        """
        if self.driver.current_url == uri and not allow_refresh:
            return False
        self.driver.get(uri)
        # Wait for the document root to exist before handing control back.
        self.find('/html', None, self.TIMEOUT)
        return True

    def find(self, xpath, element=None, timeout=TIMEOUT):
        """Locate an element by XPath.

        Args:
            xpath: XPath expression to evaluate.
            element: Optional element to search from; when given, the lookup
                is immediate and ``timeout`` is not used.
            timeout: Seconds to wait for presence when searching the page.

        Returns:
            The matching element, or None when the wait times out or the
            lookup keeps raising StaleElementReferenceException.  Any other
            exception raised by the lookup propagates to the caller.
        """
        # A stale ``element`` stays stale, so retrying without a bound would
        # never terminate; give up after a fixed number of attempts.
        for _ in range(self.MAX_STALE_RETRIES + 1):
            try:
                if element is None:
                    wait = WebDriverWait(self.driver, timeout)
                    return wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
                return element.find_element(By.XPATH, xpath)
            except TimeoutException:
                return None
            except StaleElementReferenceException:
                continue
        return None

    def click(self, xpath, element=None, timeout=TIMEOUT):
        """Click an element.

        Args:
            xpath: XPath of the element to click when ``element`` is None.
            element: Optional element to click directly; ``xpath`` and
                ``timeout`` are not used in that case.
            timeout: Seconds to wait for the element to become clickable.

        Returns:
            True after the click was issued, False if the wait timed out.
        """
        try:
            if element is None:
                wait = WebDriverWait(self.driver, timeout)
                target = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
            else:
                target = element
            target.click()
            return True
        except TimeoutException:
            return False

    def get_attribute(self, xpath, name, element=None, timeout=TIMEOUT):
        """Read an attribute from an element.

        Args:
            xpath: XPath of the element when ``element`` is None.
            name: Attribute name to read.
            element: Optional element to read from directly.
            timeout: Seconds to wait for the element to become visible.

        Returns:
            The attribute value, or None if the wait timed out.
        """
        try:
            if element is None:
                wait = WebDriverWait(self.driver, timeout)
                element = wait.until(EC.visibility_of_element_located((By.XPATH, xpath)))
            return element.get_attribute(name)
        except TimeoutException:
            return None

    def get_css_value(self, xpath, name, element=None, timeout=TIMEOUT):
        """Read a CSS property from an element.

        Args:
            xpath: XPath of the element when ``element`` is None.
            name: CSS property name to read.
            element: Optional element to read from directly.
            timeout: Seconds to wait for the element to be present.

        Returns:
            The CSS property value, or None if the wait timed out.
        """
        try:
            if element is None:
                wait = WebDriverWait(self.driver, timeout)
                element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
            return element.value_of_css_property(name)
        except TimeoutException:
            return None

29
conf.json Normal file
View file

@ -0,0 +1,29 @@
{
"output_file": "contacts.csv",
"output_header": "Name,Email Address,Chat Address,Mobile Phone,Work Phone,Job Title,Department,Office Location,Company,Profile Picture",
"screenshot": ".screenshot.png",
"auth": {
"username": "91816****",
"password": "******************"
},
"path": {
"login_url": "https://live.sfsu.edu",
"contact_url": "https://outlook.office365.com/people/",
"username_txt": "//*[@id='username']",
"password_txt": "//*[@id='password']",
"login_btn": "/html/body/div/div/div/form/div[3]/button",
"stay_signed_btn": "//*[@id='idBtn_Back']",
"contact_dir_btn": "//button[contains(., 'All User')]",
"first_contact": "//*[@class='ReactVirtualized__Grid__innerScrollContainer']//*[@role='listitem']//*",
"name": "//*[@data-log-name='PersonName']",
"company": "//*[contains(@title, 'Company')]//h4/following-sibling::*",
"job_title": "//*[@data-log-name='JobTitle']",
"department": "//*[@data-log-name='Department']",
"email": "//*[contains(@title, 'Email')]//h4/following-sibling::*",
"chat": "//*[contains(@title, 'Chat')]//h4/following-sibling::*",
"mobile": "//*[contains(@title, 'Mobile')]//h4/following-sibling::*",
"work_phone": "//*[contains(@title, 'Work phone')]//h4/following-sibling::*",
"office_location": "//*[contains(@title, 'Office location')]//h4/following-sibling::*",
"profile_picture": "//img[contains(@alt, 'Profile picture')]"
}
}

10
conf.py Normal file
View file

@ -0,0 +1,10 @@
import json
class Conf:
    """Load a JSON configuration file into memory.

    The parsed document is exposed unchanged as ``self.data``.
    """

    def __init__(self, filename):
        """Parse ``filename`` as JSON.

        Args:
            filename: Path of the JSON file to read.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(filename, encoding='utf-8') as f:
            self.data = json.load(f)

    def __str__(self):
        """Return the configuration as indented JSON with sorted keys."""
        # __str__ must return a string; printing here and returning None
        # would make str(conf) and print(conf) raise TypeError.
        return json.dumps(self.data, indent=2, sort_keys=True)

94
contacts.py Normal file
View file

@ -0,0 +1,94 @@
import base64
import csv
import datetime
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from conf import Conf
from browser import Browser
# Load configuration into variables
conf = Conf('./conf.json').data
auth = conf['auth']
xpath = conf['path']

# Keys of conf['path'] scraped as text for every contact, in the order the
# columns are written to the CSV row (the profile picture is appended last).
TEXT_FIELDS = (
    'name',
    'email',
    'chat',
    'mobile',
    'work_phone',
    'job_title',
    'department',
    'office_location',
    'company',
)


def _text_of(element):
    """Return the text of *element*, or '' when the lookup found nothing."""
    return '' if element is None else element.text


def _find_required(key):
    """Locate the element for ``xpath[key]`` using the module-level browser.

    Raises RuntimeError with the offending key instead of letting a later
    attribute access on None fail with an unhelpful message.
    """
    element = browser.find(xpath[key])
    if element is None:
        raise RuntimeError('Could not locate required element: ' + key)
    return element


# Authenticate
browser = Browser(headless=False)
try:
    browser.open_uri(xpath['login_url'])

    # Enter username and password
    username = _find_required('username_txt')
    username.clear()
    username.send_keys(auth['username'])
    password = _find_required('password_txt')
    password.clear()
    password.send_keys(auth['password'])

    # Sign in
    browser.click(xpath['login_btn'])
    browser.click(xpath['stay_signed_btn'])

    # Open contacts
    browser.open_uri(xpath['contact_url'])
    browser.click(xpath['contact_dir_btn'])

    # Set focus on the first item in contacts list.  The lookup may time out;
    # the user is asked to click the first contact manually below anyway.
    first_contact = browser.find(xpath['first_contact'])
    if first_contact is not None:
        first_contact.click()

    # TODO: To automatically locate the first contact on a headless browser,
    # use driver.execute_script() with window.scrollBy
    screenshot_filename = input("Enter filname of screenshot: ")
    output_filename = input("Enter filename of output csv file: ")
    input("Click on the first contact and press Enter to continue...")

    with open(output_filename, 'a+', newline='') as f:
        writer = csv.writer(f)

        # If output file is empty, create header.  A handle opened with 'a+'
        # may already sit at end-of-file, so counting lines from the current
        # position would always yield 0; check the size explicitly instead.
        f.seek(0, 2)
        if f.tell() == 0:
            f.write(conf['output_header'] + '\r\n')

        # NOTE: there is no end-of-list detection; stop the run with Ctrl+C.
        while True:
            time.sleep(browser.TIMEOUT)

            # Current contact has :focus
            curr_contact = browser.driver.switch_to.active_element

            # Scrape current contact
            row = [_text_of(browser.find(xpath[key])) for key in TEXT_FIELDS]

            picture_b64 = ''
            profile_pic = browser.find(xpath['profile_picture'])
            if profile_pic is not None:
                # Save profile picture to local file and convert to base64
                profile_pic.screenshot(screenshot_filename)
                with open(screenshot_filename, 'rb') as p:
                    picture_b64 = base64.b64encode(p.read()).decode('utf-8')
            row.append(picture_b64)

            # Write current contact to output file
            writer.writerow(row)
            f.flush()

            # Print current contact (name and e-mail address)
            print(datetime.datetime.now(), row[0], row[1])

            # Set :focus on the next contact
            curr_contact.send_keys(Keys.DOWN)
except KeyboardInterrupt:
    print('Interrupted, stopping.')
finally:
    # Always release the Firefox session, even after an error or Ctrl+C.
    browser.driver.quit()

112242
out.csv Normal file

File diff suppressed because one or more lines are too long