Merge pull request #90 from FrostFlame/89-refactoring

89-refactoring (xpaths and selectors are now in selectors.json)
This commit is contained in:
Haris Muneer ⚡️ 2020-03-18 15:15:04 +05:00 committed by GitHub
commit e64ed73afb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 66 additions and 30 deletions

View File

@ -1,3 +1,3 @@
from .scraper import scrapper
from .scraper import scraper
scrapper()
scraper()

View File

@ -1,4 +1,5 @@
import calendar
import json
import os
import platform
import sys
@ -34,9 +35,13 @@ total_scrolls = 2500
current_scrolls = 0
scroll_time = 8
with open("selectors.json") as json_file:
selectors = json.load(json_file)
old_height = 0
firefox_profile_path = "/home/zeryx/.mozilla/firefox/0n8gmjoz.bot"
facebook_https_prefix = "https://"
firefox_profile_path = selectors.get("firefox_profile_path")
facebook_https_prefix = selectors.get("facebook_https_prefix")
facebook_link_body = selectors.get("facebook_link_body")
CHROMEDRIVER_BINARIES_FOLDER = "bin"
@ -57,9 +62,13 @@ def get_facebook_images_url(img_links):
try:
while not valid_url_found:
WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.CLASS_NAME, "spotlight"))
EC.presence_of_element_located(
(By.CLASS_NAME, selectors.get("spotlight"))
)
)
element = driver.find_element_by_class_name(
selectors.get("spotlight")
)
element = driver.find_element_by_class_name("spotlight")
img_url = element.get_attribute("src")
if img_url.find(".gif") == -1:
@ -96,7 +105,7 @@ def image_downloader(img_links, folder_name):
img_name = (link.split(".jpg")[0]).split("/")[-1] + ".jpg"
# this is the image id when there's no profile pic
if img_name == "10354686_10150004552801856_220367501106153455_n.jpg":
if img_name == selectors.get("default_image"):
img_name = "None"
else:
try:
@ -118,7 +127,7 @@ def image_downloader(img_links, folder_name):
def check_height():
new_height = driver.execute_script("return document.body.scrollHeight")
new_height = driver.execute_script(selectors.get("height_script"))
return new_height != old_height
@ -135,8 +144,8 @@ def scroll():
if current_scrolls == total_scrolls:
return
old_height = driver.execute_script("return document.body.scrollHeight")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
old_height = driver.execute_script(selectors.get("height_script"))
driver.execute_script(selectors.get("scroll_script"))
WebDriverWait(driver, scroll_time, 0.05).until(
lambda driver: check_height()
)
@ -157,11 +166,11 @@ def get_status(x):
status = ""
try:
status = x.find_element_by_xpath(
".//div[@class='_5wj-']"
selectors.get("status")
).text # use _1xnd for Pages
except Exception:
try:
status = x.find_element_by_xpath(".//div[@class='userContent']").text
status = x.find_element_by_xpath(selectors.get("status_exc")).text
except Exception:
pass
return status
@ -169,7 +178,7 @@ def get_status(x):
def get_div_links(x, tag):
try:
temp = x.find_element_by_xpath(".//div[@class='_3x-2']")
temp = x.find_element_by_xpath(selectors.get("temp"))
return temp.find_element_by_tag_name(tag)
except Exception:
return ""
@ -183,13 +192,13 @@ def get_title_links(title):
def get_title(x):
title = ""
try:
title = x.find_element_by_xpath(".//span[@class='fwb fcg']")
title = x.find_element_by_xpath(selectors.get("title"))
except Exception:
try:
title = x.find_element_by_xpath(".//span[@class='fcg']")
title = x.find_element_by_xpath(selectors.get("title_exc1"))
except Exception:
try:
title = x.find_element_by_xpath(".//span[@class='fwn fcg']")
title = x.find_element_by_xpath(selectors.get("title_exc2"))
except Exception:
pass
finally:
@ -253,13 +262,13 @@ def extract_and_write_posts(elements, filename):
# title
title = get_title(x)
if title.text.find("shared a memory") != -1:
x = x.find_element_by_xpath(".//div[@class='_1dwg _1w_m']")
x = x.find_element_by_xpath(selectors.get("title_element"))
title = get_title(x)
status = get_status(x)
if (
title.text
== driver.find_element_by_id("fb-timeline-cover-name").text
== driver.find_element_by_id(selectors.get("title_text")).text
):
if status == "":
temp = get_div_links(x, "img")
@ -385,11 +394,14 @@ def save_to_file(name, elements, status, current_section):
driver.get(friend)
WebDriverWait(driver, 30).until(
EC.presence_of_element_located(
(By.CLASS_NAME, "profilePicThumb")
(
By.CLASS_NAME,
selectors.get("profilePicThumb"),
)
)
)
l = driver.find_element_by_class_name(
"profilePicThumb"
selectors.get("profilePicThumb")
).get_attribute("href")
except Exception:
l = "None"
@ -439,7 +451,7 @@ def save_to_file(name, elements, status, current_section):
if download_uploaded_photos:
if photos_small_size:
background_img_links = driver.find_elements_by_xpath(
"//*[contains(@id, 'pic_')]/div/i"
selectors.get("background_img_links")
)
background_img_links = [
x.get_attribute("style") for x in background_img_links
@ -479,7 +491,7 @@ def save_to_file(name, elements, status, current_section):
try:
if results[0][0] == "/":
results = [r.pop(0) for r in results]
results = [("https://en-gb.facebook.com/" + x) for x in results]
results = [(selectors.get("fb_link") + x) for x in results]
except Exception:
pass
@ -553,7 +565,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_na
# the bar which contains all the sections
sections_bar = driver.find_element_by_xpath(
"//*[@class='_3cz'][1]/div[2]/div[1]"
selectors.get("sections_bar")
)
if sections_bar.text.find(scan_list[i]) == -1:
@ -582,7 +594,9 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_na
def create_original_link(url):
if url.find(".php") != -1:
original_link = facebook_https_prefix + ".facebook.com/" + ((url.split("="))[1])
original_link = (
facebook_https_prefix + facebook_link_body + ((url.split("="))[1])
)
if original_link.find("&") != -1:
original_link = original_link.split("&")[0]
@ -590,13 +604,13 @@ def create_original_link(url):
elif url.find("fnr_t") != -1:
original_link = (
facebook_https_prefix
+ ".facebook.com/"
+ facebook_link_body
+ ((url.split("/"))[-1].split("?")[0])
)
elif url.find("_tab") != -1:
original_link = (
facebook_https_prefix
+ ".facebook.com/"
+ facebook_link_body
+ (url.split("?")[0]).split("/")[-1]
)
else:
@ -761,6 +775,7 @@ def scrap_profile(ids):
# ----------------------------------------------------------------------------
print("\nProcess Completed.")
os.chdir("../..")
return
@ -816,7 +831,7 @@ def login(email, password):
)
exit(1)
fb_path = facebook_https_prefix + "facebook.com"
fb_path = facebook_https_prefix + facebook_link_body
driver.get(fb_path)
driver.maximize_window()
@ -858,7 +873,7 @@ def login(email, password):
# -----------------------------------------------------------------------------
def scrapper(**kwargs):
def scraper(**kwargs):
with open("credentials.yaml", "r") as ymlfile:
cfg = yaml.safe_load(stream=ymlfile)
@ -867,7 +882,7 @@ def scrapper(**kwargs):
exit(1)
ids = [
facebook_https_prefix + "facebook.com/" + line.split("/")[-1]
facebook_https_prefix + facebook_link_body + line.split("/")[-1]
for line in open("input.txt", newline="\n")
]
@ -887,4 +902,4 @@ def scrapper(**kwargs):
if __name__ == "__main__":
# get things rolling
scrapper()
scraper()

21
selectors.json Normal file
View File

@ -0,0 +1,21 @@
{
"status": ".//div[@class='_5wj-']",
"sections_bar": "//*[@class='_3cz'][1]/div[2]/div[1]",
"status_exc": ".//div[@class='userContent']",
"temp": ".//div[@class='_3x-2']",
"title": ".//span[@class='fwb fcg']",
"title_exc1": ".//span[@class='fcg']",
"title_exc2": ".//span[@class='fwn fcg']",
"title_element": ".//div[@class='_1dwg _1w_m']",
"background_img_links": "//*[contains(@id, 'pic_')]/div/i",
"firefox_profile_path": "/home/zeryx/.mozilla/firefox/0n8gmjoz.bot",
"facebook_https_prefix": "https://",
"facebook_link_body": ".facebook.com/",
"spotlight": "spotlight",
"default_image": "10354686_10150004552801856_220367501106153455_n.jpg",
"height_script": "return document.body.scrollHeight",
"scroll_script": "window.scrollTo(0, document.body.scrollHeight);",
"title_text": "fb-timeline-cover-name",
"profilePicThumb": "profilePicThumb",
"fb_link": "https://en-gb.facebook.com/"
}