commit 4ed08fe3d2

README.md

@@ -27,7 +27,7 @@
<a href="https://www.github.com/harismuneer/Ultimate-Facebook-Scraper/fork">
    <img src="https://img.shields.io/github/forks/harismuneer/Ultimate-Facebook-Scraper.svg?style=social&label=Fork&maxAge=2592000" />
</a>
<a href="#">
    <img src="https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat&label=Contributions&colorA=red&colorB=black" />
</a>
</p>

@@ -176,17 +176,28 @@ The tool uses latest version of [Chrome Web Driver](http://chromedriver.chromium

Make sure the link ends with only the username or ID number and nothing else, and that it is in the format mentioned above.
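
For example, a link like this is fine (the username is made up):

```
https://www.facebook.com/john.doe.123
```

while something like `https://www.facebook.com/john.doe.123/photos?ref=bookmarks` is not, because extra segments follow the username.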

> Note: There are two modes to download Friends' Profile Pics and the user's Photos: Large Size and Small Size. You can change the following variables in [`scraper/scraper.py`](scraper/scraper.py#L30). By default they are set to Small Size pics because it's really quick, while Large Size mode takes time depending on the number of pictures to download.

Run the `ultimate-facebook-scraper` command! 🚀

```python
# whether to download the full image or its thumbnail (small size)
# if small size is True then it will be very quick; if it's False then it will open each photo to download it
# and it will take much more time
friends_small_size = True
photos_small_size = True
```

```bash
$ python scraper/scraper.py
```

Run the `ultimate-facebook-scraper` command! 🚀

> Note: There are two modes to download Friends' Profile Pics and the user's Photos: Large Size and Small Size. By default they are set to Small Size pics because it's really quick, while Large Size mode takes time depending on the number of pictures to download.

You can personalize your scraping using the command-line arguments:

```bash
$ python scraper/scraper.py \
    --uploaded_photos True \
    --friends_photos True \
    --friends_small_size True \
    --photos_small_size True \
    --total_scrolls 2500 \
    --scroll_time 8
```

Note that these are the default values, so there's no need to specify them if you're just testing or are okay with them.
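
For instance, a quick smoke test might override only the scrolling flags and keep everything else at its defaults (the values below are illustrative):

```bash
$ python scraper/scraper.py --total_scrolls 10 --scroll_time 5
```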

---

params.json

@@ -0,0 +1,121 @@
{
    "Friends": {
        "scan_list": [
            "All",
            "Mutual Friends",
            "Following",
            "Followers",
            "Work",
            "College",
            "Current City",
            "Hometown"
        ],
        "section": [
            "/friends",
            "/friends_mutual",
            "/following",
            "/followers",
            "/friends_work",
            "/friends_college",
            "/friends_current_city",
            "/friends_hometown"
        ],
        "elements_path": [
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
        ],
        "file_names": [
            "All Friends.txt",
            "Mutual Friends.txt",
            "Following.txt",
            "Followers.txt",
            "Work Friends.txt",
            "College Friends.txt",
            "Current City Friends.txt",
            "Hometown Friends.txt"
        ],
        "save_status": 0
    },
    "Photos": {
        "scan_list": [
            "'s Photos",
            "Photos of"
        ],
        "section": [
            "/photos_all",
            "/photos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pic_')]",
            "//*[contains(@id, 'pic_')]"
        ],
        "file_names": [
            "Uploaded Photos.txt",
            "Tagged Photos.txt"
        ],
        "save_status": 1
    },
    "Videos": {
        "scan_list": [
            "'s Videos",
            "Videos of"
        ],
        "section": [
            "/videos_by",
            "/videos_of"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
        ],
        "file_names": [
            "Uploaded Videos.txt",
            "Tagged Videos.txt"
        ],
        "save_status": 2
    },
    "About": {
        "scan_list": [],
        "section": [
            "/about?section=overview",
            "/about?section=education",
            "/about?section=living",
            "/about?section=contact-info",
            "/about?section=relationship",
            "/about?section=bio",
            "/about?section=year-overviews"
        ],
        "elements_path": [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
        ],
        "file_names": [
            "Overview.txt",
            "Work and Education.txt",
            "Places Lived.txt",
            "Contact and Basic Info.txt",
            "Family and Relationships.txt",
            "Details About.txt",
            "Life Events.txt"
        ],
        "save_status": 3
    },
    "Posts": {
        "scan_list": [],
        "section": [],
        "elements_path": ["//div[@class='_5pcb _4b0l _2q8l']"],
        "file_names": ["Posts.txt"],
        "save_status": 4
    }
}
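
The five top-level blocks are consumed as parallel arrays: for a given section, entry `i` of `scan_list`, `section`, `elements_path`, and `file_names` together describes one sub-page to visit, while `save_status` tells the scraper how to persist the results. A minimal sketch of walking one block (assuming this file is saved as `params.json`, which is how `scraper/scraper.py` loads it later in this commit):

```python
import json

# Load the per-section scraping parameters (the JSON above).
with open("params.json") as f:
    params = json.load(f)

friends = params["Friends"]
# The four lists are index-aligned: entry i of each describes one sub-page.
for name, suffix, xpath, out_file in zip(
    friends["scan_list"],
    friends["section"],
    friends["elements_path"],
    friends["file_names"],
):
    print(f"{name}: visit <profile>{suffix}, scrape {xpath!r}, save to {out_file}")
```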

scraper/scraper.py

@@ -5,6 +5,8 @@ import platform
import sys
import urllib.request
import yaml
import utils
import argparse

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException

@@ -13,43 +15,6 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# -------------------------------------------------------------
# -------------------------------------------------------------


# Global Variables

driver = None

# whether to download photos or not
download_uploaded_photos = True
download_friends_photos = True

# whether to download the full image or its thumbnail (small size)
# if small size is True then it will be very quick; if it's False then it will open each photo to download it
# and it will take much more time
friends_small_size = True
photos_small_size = True

total_scrolls = 2500
current_scrolls = 0
scroll_time = 8

with open("selectors.json") as json_file:
    selectors = json.load(json_file)

old_height = 0
firefox_profile_path = selectors.get("firefox_profile_path")
facebook_https_prefix = selectors.get("facebook_https_prefix")
facebook_link_body = selectors.get("facebook_link_body")


CHROMEDRIVER_BINARIES_FOLDER = "bin"


# -------------------------------------------------------------
# -------------------------------------------------------------


def get_facebook_images_url(img_links):
    urls = []

@@ -93,7 +58,7 @@ def image_downloader(img_links, folder_name):
    parent = os.getcwd()
    try:
        folder = os.path.join(os.getcwd(), folder_name)
        create_folder(folder)
        utils.create_folder(folder)
        os.chdir(folder)
    except Exception:
        print("Error in changing directory.")

@@ -126,120 +91,6 @@ def image_downloader(img_links, folder_name):
# -------------------------------------------------------------


def check_height():
    new_height = driver.execute_script(selectors.get("height_script"))
    return new_height != old_height


# -------------------------------------------------------------
# -------------------------------------------------------------

# helper function: used to scroll the page
def scroll():
    global old_height
    current_scrolls = 0

    while True:
        try:
            if current_scrolls == total_scrolls:
                return

            old_height = driver.execute_script(selectors.get("height_script"))
            driver.execute_script(selectors.get("scroll_script"))
            WebDriverWait(driver, scroll_time, 0.05).until(
                lambda driver: check_height()
            )
            current_scrolls += 1
        except TimeoutException:
            break

    return


# -------------------------------------------------------------
# -------------------------------------------------------------

# -- Helper Functions for Posts


def get_status(x):
    status = ""
    try:
        status = x.find_element_by_xpath(
            selectors.get("status")
        ).text  # use _1xnd for Pages
    except Exception:
        try:
            status = x.find_element_by_xpath(selectors.get("status_exc")).text
        except Exception:
            pass
    return status


def get_div_links(x, tag):
    try:
        temp = x.find_element_by_xpath(selectors.get("temp"))
        return temp.find_element_by_tag_name(tag)
    except Exception:
        return ""


def get_title_links(title):
    l = title.find_elements_by_tag_name("a")
    return l[-1].text, l[-1].get_attribute("href")


def get_title(x):
    title = ""
    try:
        title = x.find_element_by_xpath(selectors.get("title"))
    except Exception:
        try:
            title = x.find_element_by_xpath(selectors.get("title_exc1"))
        except Exception:
            try:
                title = x.find_element_by_xpath(selectors.get("title_exc2"))
            except Exception:
                pass
    finally:
        return title


def get_time(x):
    time = ""
    try:
        time = x.find_element_by_tag_name("abbr").get_attribute("title")
        time = (
            str("%02d" % int(time.split(", ")[1].split()[1]),)
            + "-"
            + str(
                (
                    "%02d"
                    % (
                        int(
                            (
                                list(calendar.month_abbr).index(
                                    time.split(", ")[1].split()[0][:3]
                                )
                            )
                        ),
                    )
                )
            )
            + "-"
            + time.split()[3]
            + " "
            + str("%02d" % int(time.split()[5].split(":")[0]))
            + ":"
            + str(time.split()[5].split(":")[1])
        )
    except Exception:
        pass

    finally:
        return time


def extract_and_write_posts(elements, filename):
    try:
        f = open(filename, "w", newline="\r\n")

@@ -257,58 +108,64 @@ def extract_and_write_posts(elements, filename):
        time = " "

        # time
        time = get_time(x)
        time = utils.get_time(x)

        # title
        title = get_title(x)
        title = utils.get_title(x, selectors)
        if title.text.find("shared a memory") != -1:
            x = x.find_element_by_xpath(selectors.get("title_element"))
            title = get_title(x)
            title = utils.get_title(x, selectors)

        status = get_status(x)
        status = utils.get_status(x, selectors)
        if (
            title.text
            == driver.find_element_by_id(selectors.get("title_text")).text
        ):
            if status == "":
                temp = get_div_links(x, "img")
                temp = utils.get_div_links(x, "img", selectors)
                if (
                    temp == ""
                ):  # no image tag, which means it is not a life event
                    link = get_div_links(x, "a").get_attribute("href")
                    link = utils.get_div_links(x, "a", selectors).get_attribute(
                        "href"
                    )
                    type = "status update without text"
                else:
                    type = "life event"
                    link = get_div_links(x, "a").get_attribute("href")
                    status = get_div_links(x, "a").text
                    link = utils.get_div_links(x, "a", selectors).get_attribute(
                        "href"
                    )
                    status = utils.get_div_links(x, "a", selectors).text
            else:
                type = "status update"
                if get_div_links(x, "a") != "":
                    link = get_div_links(x, "a").get_attribute("href")
                if utils.get_div_links(x, "a", selectors) != "":
                    link = utils.get_div_links(x, "a", selectors).get_attribute(
                        "href"
                    )

        elif title.text.find(" shared ") != -1:

            x1, link = get_title_links(title)
            x1, link = utils.get_title_links(title)
            type = "shared " + x1

        elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
            if title.text.find(" at ") != -1:
                x1, link = get_title_links(title)
                x1, link = utils.get_title_links(title)
                type = "check in"
            elif title.text.find(" in ") != -1:
                status = get_div_links(x, "a").text
                status = utils.get_div_links(x, "a", selectors).text

        elif (
            title.text.find(" added ") != -1 and title.text.find("photo") != -1
        ):
            type = "added photo"
            link = get_div_links(x, "a").get_attribute("href")
            link = utils.get_div_links(x, "a", selectors).get_attribute("href")

        elif (
            title.text.find(" added ") != -1 and title.text.find("video") != -1
        ):
            type = "added video"
            link = get_div_links(x, "a").get_attribute("href")
            link = utils.get_div_links(x, "a", selectors).get_attribute("href")

        else:
            type = "others"

@@ -547,6 +404,7 @@ def save_to_file(name, elements, status, current_section):


def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):

    """Given some parameters, this function can scrape friends/photos/videos/about/posts (statuses) of a profile."""
    page = []

@@ -572,7 +430,7 @@ def scrape_data(user_id, scan_list, section, elements_path, save_status, file_na
            continue

        if save_status != 3:
            scroll()
            utils.scroll(total_scrolls, driver, selectors, scroll_time)

        data = driver.find_elements_by_xpath(elements_path[i])

@@ -619,16 +477,9 @@ def create_original_link(url):
    return original_link


# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
def create_folder(folder):
    if not os.path.exists(folder):
        os.mkdir(folder)


def scrap_profile(ids):
    folder = os.path.join(os.getcwd(), "data")
    create_folder(folder)
    utils.create_folder(folder)
    os.chdir(folder)

    # execute for all profiles given in input.txt file

@@ -642,137 +493,34 @@ def scrap_profile(ids):

        try:
            target_dir = os.path.join(folder, user_id.split("/")[-1])
            create_folder(target_dir)
            utils.create_folder(target_dir)
            os.chdir(target_dir)
        except Exception:
            print("Some error occurred in creating the profile directory.")
            continue

        # ----------------------------------------------------------------------------
        print("----------------------------------------")
        print("Friends..")
        # setting parameters for scrape_data() to scrape friends
        scan_list = [
            "All",
            "Mutual Friends",
            "Following",
            "Followers",
            "Work",
            "College",
            "Current City",
            "Hometown",
        ]
        section = [
            "/friends",
            "/friends_mutual",
            "/following",
            "/followers",
            "/friends_work",
            "/friends_college",
            "/friends_current_city",
            "/friends_hometown",
        ]
        elements_path = [
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
            "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
            "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
        ]
        file_names = [
            "All Friends.txt",
            "Mutual Friends.txt",
            "Following.txt",
            "Followers.txt",
            "Work Friends.txt",
            "College Friends.txt",
            "Current City Friends.txt",
            "Hometown Friends.txt",
        ]
        save_status = 0
        to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
        for item in to_scrap:
            print("----------------------------------------")
            print("Scraping {}..".format(item))

        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
        print("Friends Done!")
            if item == "Posts":
                scan_list = [None]
            elif item == "About":
                scan_list = [None] * 7
            else:
                scan_list = params[item]["scan_list"]

        # ----------------------------------------------------------------------------
            section = params[item]["section"]
            elements_path = params[item]["elements_path"]
            file_names = params[item]["file_names"]
            save_status = params[item]["save_status"]

        print("----------------------------------------")
        print("Photos..")
            print("Scraping Links..")
        # setting parameters for scrape_data() to scrape photos
        scan_list = ["'s Photos", "Photos of"]
        section = ["/photos_all", "/photos_of"]
        elements_path = ["//*[contains(@id, 'pic_')]"] * 2
        file_names = ["Uploaded Photos.txt", "Tagged Photos.txt"]
        save_status = 1
            scrape_data(
                user_id, scan_list, section, elements_path, save_status, file_names
            )

        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
        print("Photos Done!")

        # ----------------------------------------------------------------------------
        print("----------------------------------------")
        print("Videos:")
        # setting parameters for scrape_data() to scrape videos
        scan_list = ["'s Videos", "Videos of"]
        section = ["/videos_by", "/videos_of"]
        elements_path = [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
        ] * 2
        file_names = ["Uploaded Videos.txt", "Tagged Videos.txt"]
        save_status = 2

        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
        print("Videos Done!")
        # ----------------------------------------------------------------------------

        print("----------------------------------------")
        print("About:")
        # setting parameters for scrape_data() to scrape the about section
        scan_list = [None] * 7
        section = [
            "/about?section=overview",
            "/about?section=education",
            "/about?section=living",
            "/about?section=contact-info",
            "/about?section=relationship",
            "/about?section=bio",
            "/about?section=year-overviews",
        ]
        elements_path = [
            "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
        ] * 7
        file_names = [
            "Overview.txt",
            "Work and Education.txt",
            "Places Lived.txt",
            "Contact and Basic Info.txt",
            "Family and Relationships.txt",
            "Details About.txt",
            "Life Events.txt",
        ]
        save_status = 3

        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
        print("About Section Done!")

        # ----------------------------------------------------------------------------
        print("----------------------------------------")
        print("Posts:")
        # setting parameters for scrape_data() to scrape posts
        scan_list = [None]
        section = []
        elements_path = ['//div[@class="_5pcb _4b0l _2q8l"]']

        file_names = ["Posts.txt"]
        save_status = 4

        scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
        print("Posts (Statuses) Done!")
        print("----------------------------------------")
        # ----------------------------------------------------------------------------
            print("{} Done!".format(item))

    print("\nProcess Completed.")
    os.chdir("../..")

@@ -901,5 +649,73 @@ def scraper(**kwargs):
# -------------------------------------------------------------

if __name__ == "__main__":

    ap = argparse.ArgumentParser()
    # PLS CHECK IF HELP CAN BE BETTER / LESS AMBIGUOUS
    ap.add_argument(
        "-dup",
        "--uploaded_photos",
        help="download users' uploaded photos?",
        default=True,
    )
    ap.add_argument(
        "-dfp", "--friends_photos", help="download users' photos?", default=True
    )
    ap.add_argument(
        "-fss",
        "--friends_small_size",
        help="Download friends' pictures in small size?",
        default=True,
    )
    ap.add_argument(
        "-pss",
        "--photos_small_size",
        help="Download photos in small size?",
        default=True,
    )
    ap.add_argument(
        "-ts",
        "--total_scrolls",
        help="How many times should I scroll down?",
        default=2500,
    )
    ap.add_argument(
        "-st", "--scroll_time", help="How much time should I take to scroll?", default=8
    )

    args = vars(ap.parse_args())
    print(args)

    # ---------------------------------------------------------
    # Global Variables
    # ---------------------------------------------------------

    # whether to download photos or not
    download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
    download_friends_photos = utils.to_bool(args["friends_photos"])

    # whether to download the full image or its thumbnail (small size)
    # if small size is True then it will be very quick; if it's False then it will open each photo to download it
    # and it will take much more time
    friends_small_size = utils.to_bool(args["friends_small_size"])
    photos_small_size = utils.to_bool(args["photos_small_size"])

    total_scrolls = int(args["total_scrolls"])
    scroll_time = int(args["scroll_time"])

    current_scrolls = 0
    old_height = 0

    driver = None
    CHROMEDRIVER_BINARIES_FOLDER = "bin"

    with open("selectors.json") as a, open("params.json") as b:
        selectors = json.load(a)
        params = json.load(b)

    firefox_profile_path = selectors.get("firefox_profile_path")
    facebook_https_prefix = selectors.get("facebook_https_prefix")
    facebook_link_body = selectors.get("facebook_link_body")

    # get things rolling
    scraper()

utils.py

@@ -0,0 +1,139 @@
import argparse
import calendar  # needed by get_time() below
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def to_bool(x):
    if x in ["False", "0", 0, False]:
        return False
    elif x in ["True", "1", 1, True]:
        return True
    else:
        raise argparse.ArgumentTypeError("Boolean value expected")
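
Since `to_bool` raises `argparse.ArgumentTypeError`, it can also be plugged directly into the parser as a `type=` converter instead of being called on the parsed values afterwards; a small sketch (assuming this module is importable as `utils`, as `scraper.py` does):

```python
import argparse
import utils

ap = argparse.ArgumentParser()
# argparse calls utils.to_bool on the raw string and prints a clean error
# if the value is not one of "True"/"False"/"1"/"0".
ap.add_argument("--friends_small_size", type=utils.to_bool, default=True)

args = ap.parse_args(["--friends_small_size", "False"])
print(args.friends_small_size)  # -> False
```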


# -----------------------------------------------------------------------------
#
# -----------------------------------------------------------------------------
def create_folder(folder):
    if not os.path.exists(folder):
        os.mkdir(folder)


# -------------------------------------------------------------
# Helper functions for Page scrolling
# -------------------------------------------------------------
# check if height changed
def check_height(driver, selectors, old_height):
    new_height = driver.execute_script(selectors.get("height_script"))
    return new_height != old_height


# helper function: used to scroll the page
def scroll(total_scrolls, driver, selectors, scroll_time):
    global old_height
    current_scrolls = 0

    while True:
        try:
            if current_scrolls == total_scrolls:
                return

            old_height = driver.execute_script(selectors.get("height_script"))
            driver.execute_script(selectors.get("scroll_script"))
            # poll every 0.05s until the page height grows; give up after scroll_time seconds
            WebDriverWait(driver, scroll_time, 0.05).until(
                lambda driver: check_height(driver, selectors, old_height)
            )
            current_scrolls += 1
        except TimeoutException:
            break

    return
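
`check_height` and `scroll` only consume two script snippets from `selectors.json`, which is not part of this diff. For reference, the classic infinite-scroll pattern they implement would use entries along these lines (hypothetical stand-ins; the real values live in `selectors.json`):

```python
# Hypothetical stand-ins for the real entries in selectors.json.
selectors = {
    "height_script": "return document.body.scrollHeight;",
    "scroll_script": "window.scrollTo(0, document.body.scrollHeight);",
}
```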


# -----------------------------------------------------------------------------
# Helper Functions for Posts
# -----------------------------------------------------------------------------


def get_status(x, selectors):
    status = ""
    try:
        status = x.find_element_by_xpath(
            selectors.get("status")
        ).text  # use _1xnd for Pages
    except Exception:
        try:
            status = x.find_element_by_xpath(selectors.get("status_exc")).text
        except Exception:
            pass
    return status


def get_div_links(x, tag, selectors):
    try:
        temp = x.find_element_by_xpath(selectors.get("temp"))
        return temp.find_element_by_tag_name(tag)
    except Exception:
        return ""


def get_title_links(title):
    l = title.find_elements_by_tag_name("a")
    return l[-1].text, l[-1].get_attribute("href")


def get_title(x, selectors):
    title = ""
    try:
        title = x.find_element_by_xpath(selectors.get("title"))
    except Exception:
        try:
            title = x.find_element_by_xpath(selectors.get("title_exc1"))
        except Exception:
            try:
                title = x.find_element_by_xpath(selectors.get("title_exc2"))
            except Exception:
                pass
    finally:
        return title


def get_time(x):
    time = ""
    try:
        time = x.find_element_by_tag_name("abbr").get_attribute("title")
        # rebuild the timestamp as "DD-MM-YYYY HH:MM"
        time = (
            str("%02d" % int(time.split(", ")[1].split()[1]),)
            + "-"
            + str(
                (
                    "%02d"
                    % (
                        int(
                            (
                                list(calendar.month_abbr).index(
                                    time.split(", ")[1].split()[0][:3]
                                )
                            )
                        ),
                    )
                )
            )
            + "-"
            + time.split()[3]
            + " "
            + str("%02d" % int(time.split()[5].split(":")[0]))
            + ":"
            + str(time.split()[5].split(":")[1])
        )
    except Exception:
        pass

    finally:
        return time
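
The string surgery in `get_time` assumes the `abbr` title looks like `Monday, June 4, 2018 at 1:23 PM` (a made-up example) and reassembles it as `DD-MM-YYYY HH:MM`. Under that same assumption, `datetime.strptime` does the parsing in one step; note it also normalizes to 24-hour time, which the hand-rolled version does not:

```python
from datetime import datetime

raw = "Monday, June 4, 2018 at 1:23 PM"  # hypothetical abbr title
parsed = datetime.strptime(raw, "%A, %B %d, %Y at %I:%M %p")
print(parsed.strftime("%d-%m-%Y %H:%M"))  # -> 04-06-2018 13:23
```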