Merge pull request #90 from FrostFlame/89-refactoring
89-refactoring (xpaths and selectors are now in selectors.json)
Commit: e64ed73afb
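The refactor follows one pattern throughout: every hardcoded XPath, CSS class name, and URL fragment moves into the new selectors.json, which the scraper module loads once at import time and reads via selectors.get(...). Three files change: the package entry module (first hunk below), scraper.py, and the new selectors.json. A minimal sketch of the pattern, using the file name and keys this diff introduces (the surrounding Selenium setup is assumed):

import json

# selectors.json is loaded once at module import, as in the diff below.
with open("selectors.json") as json_file:
    selectors = json.load(json_file)

# Hardcoded literals then become dictionary lookups, e.g.
#   before: driver.find_element_by_class_name("spotlight")
#   after:  driver.find_element_by_class_name(selectors.get("spotlight"))
assert selectors.get("spotlight") == "spotlight"
assert selectors.get("facebook_link_body") == ".facebook.com/"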
@@ -1,3 +1,3 @@
-from .scraper import scrapper
+from .scraper import scraper
 
-scrapper()
+scraper()
scraper.py
@@ -1,4 +1,5 @@
 import calendar
+import json
 import os
 import platform
 import sys
@@ -34,9 +35,13 @@ total_scrolls = 2500
 current_scrolls = 0
 scroll_time = 8
 
+with open("selectors.json") as json_file:
+    selectors = json.load(json_file)
+
 old_height = 0
-firefox_profile_path = "/home/zeryx/.mozilla/firefox/0n8gmjoz.bot"
-facebook_https_prefix = "https://"
+firefox_profile_path = selectors.get("firefox_profile_path")
+facebook_https_prefix = selectors.get("facebook_https_prefix")
+facebook_link_body = selectors.get("facebook_link_body")
 
 
 CHROMEDRIVER_BINARIES_FOLDER = "bin"
@@ -57,9 +62,13 @@ def get_facebook_images_url(img_links):
     try:
         while not valid_url_found:
             WebDriverWait(driver, 30).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "spotlight"))
+                EC.presence_of_element_located(
+                    (By.CLASS_NAME, selectors.get("spotlight"))
+                )
             )
-            element = driver.find_element_by_class_name("spotlight")
+            element = driver.find_element_by_class_name(
+                selectors.get("spotlight")
+            )
             img_url = element.get_attribute("src")
 
             if img_url.find(".gif") == -1:
@@ -96,7 +105,7 @@ def image_downloader(img_links, folder_name):
         img_name = (link.split(".jpg")[0]).split("/")[-1] + ".jpg"
 
         # this is the image id when there's no profile pic
-        if img_name == "10354686_10150004552801856_220367501106153455_n.jpg":
+        if img_name == selectors.get("default_image"):
            img_name = "None"
        else:
            try:
@@ -118,7 +127,7 @@ def image_downloader(img_links, folder_name):
 
 
 def check_height():
-    new_height = driver.execute_script("return document.body.scrollHeight")
+    new_height = driver.execute_script(selectors.get("height_script"))
     return new_height != old_height
 
 
@@ -135,8 +144,8 @@ def scroll():
     if current_scrolls == total_scrolls:
         return
 
-    old_height = driver.execute_script("return document.body.scrollHeight")
-    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+    old_height = driver.execute_script(selectors.get("height_script"))
+    driver.execute_script(selectors.get("scroll_script"))
     WebDriverWait(driver, scroll_time, 0.05).until(
         lambda driver: check_height()
     )
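The scroll() hunk above keeps the same wait pattern while sourcing the JavaScript snippets from selectors.json: record the page height, scroll to the bottom, then poll every 0.05 s (up to scroll_time seconds) until check_height() sees the height change. A self-contained sketch of that poll-until-grown pattern, assuming a Firefox driver and an example URL (illustration only, not the repository's code):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()
driver.get("https://example.com")

# Remember the current document height, then trigger an infinite-scroll step.
old_height = driver.execute_script("return document.body.scrollHeight")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Poll every 50 ms, for at most 8 s, until newly loaded content grows the page.
WebDriverWait(driver, 8, 0.05).until(
    lambda d: d.execute_script("return document.body.scrollHeight") != old_height
)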
@@ -157,11 +166,11 @@ def get_status(x):
     status = ""
     try:
         status = x.find_element_by_xpath(
-            ".//div[@class='_5wj-']"
+            selectors.get("status")
         ).text  # use _1xnd for Pages
     except Exception:
         try:
-            status = x.find_element_by_xpath(".//div[@class='userContent']").text
+            status = x.find_element_by_xpath(selectors.get("status_exc")).text
         except Exception:
             pass
     return status
@@ -169,7 +178,7 @@ def get_status(x):
 
 def get_div_links(x, tag):
     try:
-        temp = x.find_element_by_xpath(".//div[@class='_3x-2']")
+        temp = x.find_element_by_xpath(selectors.get("temp"))
         return temp.find_element_by_tag_name(tag)
     except Exception:
         return ""
@@ -183,13 +192,13 @@ def get_title_links(title):
 def get_title(x):
     title = ""
     try:
-        title = x.find_element_by_xpath(".//span[@class='fwb fcg']")
+        title = x.find_element_by_xpath(selectors.get("title"))
     except Exception:
         try:
-            title = x.find_element_by_xpath(".//span[@class='fcg']")
+            title = x.find_element_by_xpath(selectors.get("title_exc1"))
         except Exception:
             try:
-                title = x.find_element_by_xpath(".//span[@class='fwn fcg']")
+                title = x.find_element_by_xpath(selectors.get("title_exc2"))
             except Exception:
                 pass
     finally:
@@ -253,13 +262,13 @@ def extract_and_write_posts(elements, filename):
         # title
         title = get_title(x)
         if title.text.find("shared a memory") != -1:
-            x = x.find_element_by_xpath(".//div[@class='_1dwg _1w_m']")
+            x = x.find_element_by_xpath(selectors.get("title_element"))
             title = get_title(x)
 
         status = get_status(x)
         if (
             title.text
-            == driver.find_element_by_id("fb-timeline-cover-name").text
+            == driver.find_element_by_id(selectors.get("title_text")).text
         ):
             if status == "":
                 temp = get_div_links(x, "img")
@@ -385,11 +394,14 @@ def save_to_file(name, elements, status, current_section):
             driver.get(friend)
             WebDriverWait(driver, 30).until(
                 EC.presence_of_element_located(
-                    (By.CLASS_NAME, "profilePicThumb")
+                    (
+                        By.CLASS_NAME,
+                        selectors.get("profilePicThumb"),
+                    )
                 )
             )
             l = driver.find_element_by_class_name(
-                "profilePicThumb"
+                selectors.get("profilePicThumb")
             ).get_attribute("href")
         except Exception:
             l = "None"
|
@ -439,7 +451,7 @@ def save_to_file(name, elements, status, current_section):
|
|||
if download_uploaded_photos:
|
||||
if photos_small_size:
|
||||
background_img_links = driver.find_elements_by_xpath(
|
||||
"//*[contains(@id, 'pic_')]/div/i"
|
||||
selectors.get("background_img_links")
|
||||
)
|
||||
background_img_links = [
|
||||
x.get_attribute("style") for x in background_img_links
|
||||
|
|
@@ -479,7 +491,7 @@ def save_to_file(name, elements, status, current_section):
     try:
         if results[0][0] == "/":
             results = [r.pop(0) for r in results]
-            results = [("https://en-gb.facebook.com/" + x) for x in results]
+            results = [(selectors.get("fb_link") + x) for x in results]
     except Exception:
         pass
 
@@ -553,7 +565,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_na
 
     # the bar which contains all the sections
     sections_bar = driver.find_element_by_xpath(
-        "//*[@class='_3cz'][1]/div[2]/div[1]"
+        selectors.get("sections_bar")
     )
 
     if sections_bar.text.find(scan_list[i]) == -1:
@@ -582,7 +594,9 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_na
 
 def create_original_link(url):
     if url.find(".php") != -1:
-        original_link = facebook_https_prefix + ".facebook.com/" + ((url.split("="))[1])
+        original_link = (
+            facebook_https_prefix + facebook_link_body + ((url.split("="))[1])
+        )
 
         if original_link.find("&") != -1:
             original_link = original_link.split("&")[0]
@@ -590,13 +604,13 @@ def create_original_link(url):
     elif url.find("fnr_t") != -1:
         original_link = (
             facebook_https_prefix
-            + ".facebook.com/"
+            + facebook_link_body
             + ((url.split("/"))[-1].split("?")[0])
         )
     elif url.find("_tab") != -1:
         original_link = (
             facebook_https_prefix
-            + ".facebook.com/"
+            + facebook_link_body
             + (url.split("?")[0]).split("/")[-1]
         )
     else:
@@ -761,6 +775,7 @@ def scrap_profile(ids):
     # ----------------------------------------------------------------------------
+
     print("\nProcess Completed.")
     os.chdir("../..")
 
     return
 
@@ -816,7 +831,7 @@ def login(email, password):
         )
         exit(1)
 
-    fb_path = facebook_https_prefix + "facebook.com"
+    fb_path = facebook_https_prefix + facebook_link_body
     driver.get(fb_path)
     driver.maximize_window()
 
@@ -858,7 +873,7 @@
 # -----------------------------------------------------------------------------
 
 
-def scrapper(**kwargs):
+def scraper(**kwargs):
     with open("credentials.yaml", "r") as ymlfile:
         cfg = yaml.safe_load(stream=ymlfile)
 
@@ -867,7 +882,7 @@ def scrapper(**kwargs):
         exit(1)
 
     ids = [
-        facebook_https_prefix + "facebook.com/" + line.split("/")[-1]
+        facebook_https_prefix + facebook_link_body + line.split("/")[-1]
         for line in open("input.txt", newline="\n")
     ]
 
@@ -887,4 +902,4 @@ def scrapper(**kwargs):
 
 if __name__ == "__main__":
     # get things rolling
-    scrapper()
+    scraper()
selectors.json (new file)
@@ -0,0 +1,21 @@
+{
+    "status": ".//div[@class='_5wj-']",
+    "sections_bar": "//*[@class='_3cz'][1]/div[2]/div[1]",
+    "status_exc": ".//div[@class='userContent']",
+    "temp": ".//div[@class='_3x-2']",
+    "title": ".//span[@class='fwb fcg']",
+    "title_exc1": ".//span[@class='fcg']",
+    "title_exc2": ".//span[@class='fwn fcg']",
+    "title_element": ".//div[@class='_1dwg _1w_m']",
+    "background_img_links": "//*[contains(@id, 'pic_')]/div/i",
+    "firefox_profile_path": "/home/zeryx/.mozilla/firefox/0n8gmjoz.bot",
+    "facebook_https_prefix": "https://",
+    "facebook_link_body": ".facebook.com/",
+    "spotlight": "spotlight",
+    "default_image": "10354686_10150004552801856_220367501106153455_n.jpg",
+    "height_script": "return document.body.scrollHeight",
+    "scroll_script": "window.scrollTo(0, document.body.scrollHeight);",
+    "title_text": "fb-timeline-cover-name",
+    "profilePicThumb": "profilePicThumb",
+    "fb_link": "https://en-gb.facebook.com/"
+}
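One property of the selectors.get(...) style worth noting: dict.get returns None for a missing key, so a misspelled selector name only fails later, when Selenium receives None, rather than raising KeyError at load time. A short illustration using keys from the file above plus one hypothetical missing key:

import json

with open("selectors.json") as json_file:
    selectors = json.load(json_file)

print(selectors.get("title"))        # ".//span[@class='fwb fcg']"
print(selectors.get("no_such_key"))  # None -- surfaces only when passed to Selenium
print(selectors["title"])            # indexing would raise KeyError on a bad name instead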