Merge pull request #96 from elyesmanai/enhancements

refactoring
Haris Muneer ⚡️ 2020-03-28 21:32:49 +05:00 committed by GitHub
commit 4ed08fe3d2
4 changed files with 393 additions and 306 deletions

README.md

@@ -27,7 +27,7 @@
<a href="https://www.github.com/harismuneer/Ultimate-Facebook-Scraper/fork">
<img src="https://img.shields.io/github/forks/harismuneer/Ultimate-Facebook-Scraper.svg?style=social&label=Fork&maxAge=2592000" />
</a>
<a href="#">
<a href="#">
<img src="https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat&label=Contributions&colorA=red&colorB=black" />
</a>
</p>
@@ -176,17 +176,28 @@ The tool uses the latest version of [Chrome Web Driver](http://chromedriver.chromium
Make sure the link only contains the username or ID number at the end and nothing else. Make sure it's in the format mentioned above.
> Note: There are two modes to download Friends Profile Pics and the user's Photos: Large Size and Small Size. You can change the following variables in [`scraper/scraper.py`](scraper/scraper.py#L30). By default they are set to Small Sized Pics because it's really quick, while Large Size Mode takes time depending on the number of pictures to download.
Run the `ultimate-facebook-scraper` command! 🚀
```python
# whether to download the full image or its thumbnail (small size)
# if small size is True, downloading is very quick; if it's False, the scraper
# opens each photo to download it, which takes much more time
friends_small_size = True
photos_small_size = True
```
```bash
$ python scraper/scraper.py
```
Run the `ultimate-facebook-scraper` command! 🚀
> Note: There are two modes to download Friends Profile Pics and the user's Photos: Large Size and Small Size. By default they are set to Small Sized Pics because it's really quick, while Large Size Mode takes time depending on the number of pictures to download.
You can personalize your scraping needs using the command line arguments:
```bash
$ python scraper/scraper.py \
--uploaded_photos True \
--friends_photos True \
--friends_small_size True \
--photos_small_size True \
--total_scrolls 2500 \
--scroll_time 8
```
Note that these are the default values, so there's no need to write them out if you're just testing or are fine with them.
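One caveat with these flags: argparse passes the values through as plain strings, and any non-empty string, including "False", is truthy in Python. The scraper therefore converts them with the `to_bool` helper added in `scraper/utils.py` (shown later in this diff). A minimal sketch of that behavior, with the import path assumed:
```python
# Why "--friends_small_size False" needs explicit conversion:
print(bool("False"))  # True - any non-empty string is truthy

from utils import to_bool  # import path assumed; the scraper is run from scraper/

print(to_bool("False"))  # False
print(to_bool("1"))      # True
# Anything else, e.g. to_bool("nope"), raises argparse.ArgumentTypeError
```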
---

params.json (new file, 121 lines)

@@ -0,0 +1,121 @@
{
"Friends": {
"scan_list": [
"All",
"Mutual Friends",
"Following",
"Followers",
"Work",
"College",
"Current City",
"Hometown"
],
"section": [
"/friends",
"/friends_mutual",
"/following",
"/followers",
"/friends_work",
"/friends_college",
"/friends_current_city",
"/friends_hometown"
],
"elements_path": [
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
"//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
],
"file_names": [
"All Friends.txt",
"Mutual Friends.txt",
"Following.txt",
"Followers.txt",
"Work Friends.txt",
"College Friends.txt",
"Current City Friends.txt",
"Hometown Friends.txt"
],
"save_status": 0
},
"Photos": {
"scan_list": [
"'s Photos",
"Photos of"
],
"section": [
"/photos_all",
"/photos_of"
],
"elements_path": [
"//*[contains(@id, 'pic_')]",
"//*[contains(@id, 'pic_')]"
],
"file_names": [
"Uploaded Photos.txt",
"Tagged Photos.txt"
],
"save_status": 1
},
"Videos": {
"scan_list": [
"'s Videos",
"Videos of"
],
"section": [
"/videos_by",
"/videos_of"
],
"elements_path": [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
],
"file_names": [
"Uploaded Videos.txt",
"Tagged Videos.txt"
],
"save_status": 2
},
"About": {
"scan_list": [],
"section": [
"/about?section=overview",
"/about?section=education",
"/about?section=living",
"/about?section=contact-info",
"/about?section=relationship",
"/about?section=bio",
"/about?section=year-overviews"
],
"elements_path": [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
],
"file_names": [
"Overview.txt",
"Work and Education.txt",
"Places Lived.txt",
"Contact and Basic Info.txt",
"Family and Relationships.txt",
"Details About.txt",
"Life Events.txt"
],
"save_status": 3
},
"Posts": {
"scan_list": [],
"section": [],
"elements_path": ["//div[@class='_5pcb _4b0l _2q8l']"],
"file_names": ["Posts.txt"],
"save_status": 4
}
}
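The point of this new file is to replace the hard-coded lists that used to live in `scrap_profile()`: each top-level key supplies the `section` URLs, XPath `elements_path`, output `file_names`, and `save_status` for one pass of `scrape_data()`, with `scan_list` overridden for About and Posts (see the scraper.py diff below). A condensed sketch of that flow, with the actual scraping call stubbed out:
```python
import json

# load the new config exactly as scraper.py's __main__ block does
with open("params.json") as f:
    params = json.load(f)

for item in ["Friends", "Photos", "Videos", "About", "Posts"]:
    cfg = params[item]
    # About/Posts pages have no per-section headings to scan for
    if item == "Posts":
        scan_list = [None]
    elif item == "About":
        scan_list = [None] * 7
    else:
        scan_list = cfg["scan_list"]
    # stand-in for scrape_data(user_id, scan_list, cfg["section"], ...)
    print(item, "->", len(cfg["section"]), "URL(s), saved to", cfg["file_names"])
```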

scraper/scraper.py

@@ -5,6 +5,8 @@ import platform
import sys
import urllib.request
import yaml
import utils
import argparse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
@@ -13,43 +15,6 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# -------------------------------------------------------------
# -------------------------------------------------------------
# Global Variables
driver = None
# whether to download photos or not
download_uploaded_photos = True
download_friends_photos = True
# whether to download the full image or its thumbnail (small size)
# if small size is True, downloading is very quick; if it's False, the scraper opens
# each photo to download it, which takes much more time
friends_small_size = True
photos_small_size = True
total_scrolls = 2500
current_scrolls = 0
scroll_time = 8
with open("selectors.json") as json_file:
selectors = json.load(json_file)
old_height = 0
firefox_profile_path = selectors.get("firefox_profile_path")
facebook_https_prefix = selectors.get("facebook_https_prefix")
facebook_link_body = selectors.get("facebook_link_body")
CHROMEDRIVER_BINARIES_FOLDER = "bin"
# -------------------------------------------------------------
# -------------------------------------------------------------
def get_facebook_images_url(img_links):
urls = []
@@ -93,7 +58,7 @@ def image_downloader(img_links, folder_name):
parent = os.getcwd()
try:
folder = os.path.join(os.getcwd(), folder_name)
create_folder(folder)
utils.create_folder(folder)
os.chdir(folder)
except Exception:
print("Error in changing directory.")
@@ -126,120 +91,6 @@ def image_downloader(img_links, folder_name):
# -------------------------------------------------------------
def check_height():
new_height = driver.execute_script(selectors.get("height_script"))
return new_height != old_height
# -------------------------------------------------------------
# -------------------------------------------------------------
# helper function: used to scroll the page
def scroll():
global old_height
current_scrolls = 0
while True:
try:
if current_scrolls == total_scrolls:
return
old_height = driver.execute_script(selectors.get("height_script"))
driver.execute_script(selectors.get("scroll_script"))
WebDriverWait(driver, scroll_time, 0.05).until(
lambda driver: check_height()
)
current_scrolls += 1
except TimeoutException:
break
return
# -------------------------------------------------------------
# -------------------------------------------------------------
# --Helper Functions for Posts
def get_status(x):
status = ""
try:
status = x.find_element_by_xpath(
selectors.get("status")
).text # use _1xnd for Pages
except Exception:
try:
status = x.find_element_by_xpath(selectors.get("status_exc")).text
except Exception:
pass
return status
def get_div_links(x, tag):
try:
temp = x.find_element_by_xpath(selectors.get("temp"))
return temp.find_element_by_tag_name(tag)
except Exception:
return ""
def get_title_links(title):
l = title.find_elements_by_tag_name("a")
return l[-1].text, l[-1].get_attribute("href")
def get_title(x):
title = ""
try:
title = x.find_element_by_xpath(selectors.get("title"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc1"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc2"))
except Exception:
pass
finally:
return title
def get_time(x):
time = ""
try:
time = x.find_element_by_tag_name("abbr").get_attribute("title")
time = (
str("%02d" % int(time.split(", ")[1].split()[1]),)
+ "-"
+ str(
(
"%02d"
% (
int(
(
list(calendar.month_abbr).index(
time.split(", ")[1].split()[0][:3]
)
)
),
)
)
)
+ "-"
+ time.split()[3]
+ " "
+ str("%02d" % int(time.split()[5].split(":")[0]))
+ ":"
+ str(time.split()[5].split(":")[1])
)
except Exception:
pass
finally:
return time
def extract_and_write_posts(elements, filename):
try:
f = open(filename, "w", newline="\r\n")
@@ -257,58 +108,64 @@ def extract_and_write_posts(elements, filename):
time = " "
# time
time = get_time(x)
time = utils.get_time(x)
# title
title = get_title(x)
title = utils.get_title(x, selectors)
if title.text.find("shared a memory") != -1:
x = x.find_element_by_xpath(selectors.get("title_element"))
title = get_title(x)
title = utils.get_title(x, selectors)
status = get_status(x)
status = utils.get_status(x, selectors)
if (
title.text
== driver.find_element_by_id(selectors.get("title_text")).text
):
if status == "":
temp = get_div_links(x, "img")
temp = utils.get_div_links(x, "img", selectors)
if (
temp == ""
):  # no image tag, which means it is not a life event
link = get_div_links(x, "a").get_attribute("href")
link = utils.get_div_links(x, "a", selectors).get_attribute(
"href"
)
type = "status update without text"
else:
type = "life event"
link = get_div_links(x, "a").get_attribute("href")
status = get_div_links(x, "a").text
link = utils.get_div_links(x, "a", selectors).get_attribute(
"href"
)
status = utils.get_div_links(x, "a", selectors).text
else:
type = "status update"
if get_div_links(x, "a") != "":
link = get_div_links(x, "a").get_attribute("href")
if utils.get_div_links(x, "a", selectors) != "":
link = utils.get_div_links(x, "a", selectors).get_attribute(
"href"
)
elif title.text.find(" shared ") != -1:
x1, link = get_title_links(title)
x1, link = utils.get_title_links(title)
type = "shared " + x1
elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
if title.text.find(" at ") != -1:
x1, link = get_title_links(title)
x1, link = utils.get_title_links(title)
type = "check in"
elif title.text.find(" in ") != -1:
status = get_div_links(x, "a").text
status = utils.get_div_links(x, "a", selectors).text
elif (
title.text.find(" added ") != -1 and title.text.find("photo") != -1
):
type = "added photo"
link = get_div_links(x, "a").get_attribute("href")
link = utils.get_div_links(x, "a", selectors).get_attribute("href")
elif (
title.text.find(" added ") != -1 and title.text.find("video") != -1
):
type = "added video"
link = get_div_links(x, "a").get_attribute("href")
link = utils.get_div_links(x, "a", selectors).get_attribute("href")
else:
type = "others"
@@ -547,6 +404,7 @@ def save_to_file(name, elements, status, current_section):
def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
    """Given some parameters, this function can scrape friends/photos/videos/about/posts (statuses) of a profile"""
    page = []
@@ -572,7 +430,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
continue
if save_status != 3:
scroll()
utils.scroll(total_scrolls, driver, selectors, scroll_time)
data = driver.find_elements_by_xpath(elements_path[i])
@@ -619,16 +477,9 @@ def create_original_link(url):
return original_link
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
def create_folder(folder):
if not os.path.exists(folder):
os.mkdir(folder)
def scrap_profile(ids):
folder = os.path.join(os.getcwd(), "data")
create_folder(folder)
utils.create_folder(folder)
os.chdir(folder)
# execute for all profiles given in input.txt file
@@ -642,137 +493,34 @@ def scrap_profile(ids):
try:
target_dir = os.path.join(folder, user_id.split("/")[-1])
create_folder(target_dir)
utils.create_folder(target_dir)
os.chdir(target_dir)
except Exception:
print("Some error occurred in creating the profile directory.")
continue
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Friends..")
# setting parameters for scrape_data() to scrape friends
scan_list = [
"All",
"Mutual Friends",
"Following",
"Followers",
"Work",
"College",
"Current City",
"Hometown",
]
section = [
"/friends",
"/friends_mutual",
"/following",
"/followers",
"/friends_work",
"/friends_college",
"/friends_current_city",
"/friends_hometown",
]
elements_path = [
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
"//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
]
file_names = [
"All Friends.txt",
"Mutual Friends.txt",
"Following.txt",
"Followers.txt",
"Work Friends.txt",
"College Friends.txt",
"Current City Friends.txt",
"Hometown Friends.txt",
]
save_status = 0
to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
for item in to_scrap:
print("----------------------------------------")
print("Scraping {}..".format(item))
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Friends Done!")
if item == "Posts":
scan_list = [None]
elif item == "About":
scan_list = [None] * 7
else:
scan_list = params[item]["scan_list"]
# ----------------------------------------------------------------------------
section = params[item]["section"]
elements_path = params[item]["elements_path"]
file_names = params[item]["file_names"]
save_status = params[item]["save_status"]
print("----------------------------------------")
print("Photos..")
print("Scraping Links..")
# setting parameters for scrape_data() to scrap photos
scan_list = ["'s Photos", "Photos of"]
section = ["/photos_all", "/photos_of"]
elements_path = ["//*[contains(@id, 'pic_')]"] * 2
file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
save_status = 1
scrape_data(
user_id, scan_list, section, elements_path, save_status, file_names
)
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Photos Done!")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Videos:")
# setting parameters for scrape_data() to scrap videos
scan_list = ["'s Videos", "Videos of"]
section = ["/videos_by", "/videos_of"]
elements_path = [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
] * 2
file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
save_status = 2
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Videos Done!")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("About:")
# setting parameters for scrape_data() to scrap the about section
scan_list = [None] * 7
section = [
"/about?section=overview",
"/about?section=education",
"/about?section=living",
"/about?section=contact-info",
"/about?section=relationship",
"/about?section=bio",
"/about?section=year-overviews",
]
elements_path = [
"//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
] * 7
file_names = [
"Overview.txt",
"Work and Education.txt",
"Places Lived.txt",
"Contact and Basic Info.txt",
"Family and Relationships.txt",
"Details About.txt",
"Life Events.txt",
]
save_status = 3
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("About Section Done!")
# ----------------------------------------------------------------------------
print("----------------------------------------")
print("Posts:")
# setting parameters for scrape_data() to scrap posts
scan_list = [None]
section = []
elements_path = ['//div[@class="_5pcb _4b0l _2q8l"]']
file_names = ["Posts.txt"]
save_status = 4
scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
print("Posts(Statuses) Done!")
print("----------------------------------------")
# ----------------------------------------------------------------------------
print("{} Done!".format(item))
print("\nProcess Completed.")
os.chdir("../..")
@@ -901,5 +649,73 @@ def scraper(**kwargs):
# -------------------------------------------------------------
if __name__ == "__main__":
ap = argparse.ArgumentParser()
# PLS CHECK IF HELP CAN BE BETTER / LESS AMBIGUOUS
ap.add_argument(
"-dup",
"--uploaded_photos",
help="download users' uploaded photos?",
default=True,
)
ap.add_argument(
    "-dfp", "--friends_photos", help="download friends' photos?", default=True
)
ap.add_argument(
"-fss",
"--friends_small_size",
help="Download friends' pictures in small size?",
default=True,
)
ap.add_argument(
"-pss",
"--photos_small_size",
help="Download photos in small size?",
default=True,
)
ap.add_argument(
"-ts",
"--total_scrolls",
help="How many times should I scroll down?",
default=2500,
)
ap.add_argument(
"-st", "--scroll_time", help="How much time should I take to scroll?", default=8
)
args = vars(ap.parse_args())
print(args)
# ---------------------------------------------------------
# Global Variables
# ---------------------------------------------------------
# whether to download photos or not
download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
download_friends_photos = utils.to_bool(args["friends_photos"])
# whether to download the full image or its thumbnail (small size)
# if small size is True, downloading is very quick; if it's False, the scraper opens
# each photo to download it, which takes much more time
friends_small_size = utils.to_bool(args["friends_small_size"])
photos_small_size = utils.to_bool(args["photos_small_size"])
total_scrolls = int(args["total_scrolls"])
scroll_time = int(args["scroll_time"])
current_scrolls = 0
old_height = 0
driver = None
CHROMEDRIVER_BINARIES_FOLDER = "bin"
with open("selectors.json") as a, open("params.json") as b:
selectors = json.load(a)
params = json.load(b)
firefox_profile_path = selectors.get("firefox_profile_path")
facebook_https_prefix = selectors.get("facebook_https_prefix")
facebook_link_body = selectors.get("facebook_link_body")
# get things rolling
scraper()
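One alternative worth noting for the new CLI (the inline "PLS CHECK IF HELP" comment invites review): since these values arrive as strings, argparse could be asked to validate and convert them at parse time via `type=`, making the manual `utils.to_bool(args[...])` calls after `parse_args()` unnecessary. A sketch of that alternative, not what this commit does:
```python
# Hypothetical variant of one flag: argparse coerces the value itself,
# so a bad value fails at parse time with a clean usage error.
ap.add_argument(
    "-fss",
    "--friends_small_size",
    type=utils.to_bool,  # raises argparse.ArgumentTypeError on bad input
    default=True,
    help="download friends' profile pictures as small thumbnails (True/False)",
)
```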

scraper/utils.py (new file, 139 lines)

@@ -0,0 +1,139 @@
import argparse
import calendar  # used by get_time() below
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def to_bool(x):
if x in ["False", "0", 0, False]:
return False
elif x in ["True", "1", 1, True]:
return True
else:
raise argparse.ArgumentTypeError("Boolean value expected")
# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def create_folder(folder):
if not os.path.exists(folder):
os.mkdir(folder)
# -------------------------------------------------------------
# Helper functions for Page scrolling
# -------------------------------------------------------------
# check if height changed
def check_height(driver, selectors, old_height):
new_height = driver.execute_script(selectors.get("height_script"))
return new_height != old_height
# helper function: used to scroll the page
def scroll(total_scrolls, driver, selectors, scroll_time):
global old_height
current_scrolls = 0
while True:
try:
if current_scrolls == total_scrolls:
return
old_height = driver.execute_script(selectors.get("height_script"))
driver.execute_script(selectors.get("scroll_script"))
WebDriverWait(driver, scroll_time, 0.05).until(
lambda driver: check_height(driver, selectors, old_height)
)
current_scrolls += 1
except TimeoutException:
break
return
# -----------------------------------------------------------------------------
# Helper Functions for Posts
# -----------------------------------------------------------------------------
def get_status(x, selectors):
status = ""
try:
status = x.find_element_by_xpath(
selectors.get("status")
).text # use _1xnd for Pages
except Exception:
try:
status = x.find_element_by_xpath(selectors.get("status_exc")).text
except Exception:
pass
return status
def get_div_links(x, tag, selectors):
try:
temp = x.find_element_by_xpath(selectors.get("temp"))
return temp.find_element_by_tag_name(tag)
except Exception:
return ""
def get_title_links(title):
l = title.find_elements_by_tag_name("a")
return l[-1].text, l[-1].get_attribute("href")
def get_title(x, selectors):
title = ""
try:
title = x.find_element_by_xpath(selectors.get("title"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc1"))
except Exception:
try:
title = x.find_element_by_xpath(selectors.get("title_exc2"))
except Exception:
pass
finally:
return title
def get_time(x):
    time = ""
    try:
        # reformat the abbr element's title into "DD-MM-YYYY HH:MM"
        # (a worked example follows this file)
        time = x.find_element_by_tag_name("abbr").get_attribute("title")
        day = "%02d" % int(time.split(", ")[1].split()[1])
        month = "%02d" % list(calendar.month_abbr).index(
            time.split(", ")[1].split()[0][:3]
        )
        year = time.split()[3]
        hour = "%02d" % int(time.split()[5].split(":")[0])
        minute = time.split()[5].split(":")[1]
        time = "{}-{}-{} {}:{}".format(day, month, year, hour, minute)
    except Exception:
        pass
    finally:
        return time
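For reference, a worked example of the transformation `get_time()` performs; the input string is a hypothetical reconstruction of the `abbr` title format Facebook used at the time:
```python
import calendar

title = "Saturday, March 28, 2020 at 9:32 PM"  # assumed abbr title format
day = "%02d" % int(title.split(", ")[1].split()[1])        # "28"
month = "%02d" % list(calendar.month_abbr).index(
    title.split(", ")[1].split()[0][:3]                    # "Mar" -> 3
)
year = title.split()[3]                                    # "2020"
hour, minute = title.split()[5].split(":")                 # "9", "32"
print("{}-{}-{} {}:{}".format(day, month, year, "%02d" % int(hour), minute))
# -> "28-03-2020 09:32"  (the AM/PM marker is ignored)
```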