Skip to content

Scraping TOR/Onion sites with Python !! Easy, quick and efficient – HOXFRAMEWORK

Posted in VIDEOS

Hello everyone and welcome back!

Today we’ll be scraping some deep web sites with Python using these scripts – but please watch the video too, to make sure you are doing everything right and staying safe out there! Also, don’t scrape illegally, and be kind to website owners.

Requirements:
pysocks
requests

Here are all 3 files, separated by their titles ( -- title -- ):

 -- AHMIA SCRAPER --

def Scraper(yourquery="Credit card"):
    """Search ahmia.fi for *yourquery* and save the .onion hostnames found.

    The result page is fetched over the regular internet (not through Tor),
    scanned with a regular expression for ``<name>.onion`` hostnames, and the
    de-duplicated hostnames are written one per line to ``sites<N>.txt`` in
    the current directory, where N is a random number between 1 and 9999.

    Args:
        yourquery: Search phrase to send to ahmia.fi. It is URL-encoded
            before being placed in the query string, so spaces and
            characters such as ``&`` or ``#`` are safe to use.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    import random
    import re
    from urllib.parse import quote_plus

    import requests

    # quote_plus turns spaces into "+" (as the site expects) and also escapes
    # any other character that would otherwise break the query string.
    url = "https://ahmia.fi/search/?q={}".format(quote_plus(yourquery))

    # Send a randomly chosen browser-like User-Agent with the request.
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19577",
        "Mozilla/5.0 (X11) AppleWebKit/62.41 (KHTML, like Gecko) Edge/17.10859 Safari/452.6",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2656.18 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36",
        "Mozilla/5.0 (Linux; U; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
    ]
    headers = {'User-Agent': random.choice(ua_list)}

    def findlinks(content):
        """Extract unique .onion hostnames from *content* and save them to a file."""
        # An HTML parser such as BeautifulSoup would also work here, but a
        # regular expression is enough to pick out the hostnames.
        # Raw string: "\w" and "\." are regex escapes, not string escapes.
        mineddata = re.findall(r"\w+\.onion", content)
        # dict.fromkeys drops duplicates while keeping first-seen order.
        mineddata = list(dict.fromkeys(mineddata))

        filename = "sites{}.txt".format(random.randint(1, 9999))
        print("Saving to ... ", filename)
        # Open the file once and write every hostname in a single pass; the
        # file is still created (empty) when nothing was found.
        with open(filename, "w", encoding="utf-8") as newfile:
            newfile.writelines(link + "\n" for link in mineddata)
        print("All the files written to a text file : ", filename)

    # Without a timeout, requests may wait forever on an unresponsive server.
    request = requests.get(url, headers=headers, timeout=30)

    if request.status_code == 200:
        print("Request went through. \n")
        findlinks(request.text)
    else:
        # Report the failure instead of finishing silently with no output.
        print("Request failed with status code", request.status_code)


-- TOR Searcher -- 



def torSearcher(url, timeout=60):
    """Download *url* through the local Tor SOCKS proxy and save it to disk.

    BEFORE YOU START - a Tor client (e.g. tor.exe) must already be running
    and listening on 127.0.0.1:9050, the default Tor SOCKS port.

    The response body is written as UTF-8 text to ``<name>.txt`` in the
    current directory, where ``<name>`` is 16 random lowercase letters.

    Args:
        url: Full URL to fetch, e.g. ``"http://x.onion/"``. Fetching
            ``"http://httpbin.org/ip"`` instead is a quick way to check that
            the IP seen through Tor differs from your public IP.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the proxy is unreachable, the request
            fails, or the timeout expires.
    """
    import random
    import string

    import requests

    print("Getting ...", url)
    # The context manager closes the session's connections when we are done.
    with requests.Session() as session:
        # "socks5h" (rather than "socks5") asks the proxy to resolve the
        # hostname, which is what lets .onion addresses resolve at all.
        session.proxies = {'http':  'socks5h://127.0.0.1:9050',
                           'https': 'socks5h://127.0.0.1:9050'}
        # Without a timeout, an unreachable site would block forever.
        result = session.get(url, timeout=timeout).text

    filename = ''.join(random.choice(string.ascii_lowercase) for _ in range(16))
    with open(f"{filename}.txt", "w", encoding="utf-8") as newthing:
        newthing.write(result)

#url = "http://x.onion"
#torSearcher(url)

import sys
import os
# Command-line driver: read a newline-separated list of hosts from the file
# named by the first argument and fetch each one through Tor.
programname = os.path.basename(sys.argv[0])

if len(sys.argv) < 2:
    print("Usage : {} <newlineSeperatedList.txt>".format(programname))
else:
    thelist = sys.argv[1]
    print("Opening ...", thelist)
    try:
        with open(thelist, "r", encoding="utf-8") as newfile:
            data = newfile.read().splitlines()
    except (OSError, UnicodeDecodeError) as E:
        # Say what actually went wrong instead of only showing the usage text.
        print("Could not read {}: {}".format(thelist, E))
        print("Usage : {} <newlineSeperatedList.txt>".format(programname))
    else:
        for k in data:
            k = k.strip()
            if not k:
                # Skip blank lines; they would otherwise become "http://".
                continue
            if "://" not in k:
                k = "http://" + k
            # Handle errors per site so one unreachable host does not stop
            # the remaining hosts from being fetched.
            try:
                torSearcher(k)
            except Exception as E:
                print(E)

"""


-- RunScraper -- 

#import the actual scraper
import requests

# Safety check before scraping: look up the geolocation of the current public
# IP and refuse to continue if it still points at the home location, which
# suggests the VPN is not active.
# Replace these markers with strings that identify your own real location.
home_markers = ("Croatia", "Zagreb", "Hrvatska")

url = "http://ip-api.com/json/"
# Without a timeout the check could hang forever on a stalled connection.
key = requests.get(url, timeout=30)
#print(key.text)
if any(marker in key.text for marker in home_markers):
    print("Your VPN might not be on !!")
    safe = False
else:
    safe = True

if safe:
    # Imported only once the check has passed.
    import ahmiascraper
    ahmiascraper.Scraper()
else:
    print("IP change failed, try again later.")








And there we go. Thanks for visiting! Have a nice day! 🙂