"""Download a list of webpages concurrently and save each one to a text file.

The script prompts for a file containing one URL per line, fetches every URL
in its own thread using a fixed User-Agent header, and writes each response
body to ``<sanitised-domain>.txt`` in the current working directory.
"""

import os
import re
import sys
import time
import urllib.request
from threading import Thread
from urllib.parse import urlparse


class get_webpage:
    """Fetch webpages with a fixed User-Agent and save them to disk."""

    def __init__(self, user_agent):
        """Store *user_agent* and build the request headers from it."""
        self.agent = user_agent
        self.construct_header()

    def construct_header(self):
        """Build ``self.header`` from ``self.agent`` and print it."""
        self.header = {'User-Agent': self.agent}
        print("\nUser-agent used: " + str(self.header) + "\n")

    def make_request(self, webpage):
        """Download *webpage* and write its body to a text file.

        The target filename comes from :meth:`filename_to_write`, so two URLs
        on the same domain overwrite each other.

        Raises:
            ValueError: if *webpage* is not a valid URL (e.g. empty string or
                missing scheme).
            urllib.error.URLError: (an ``OSError``) if the request fails.
        """
        req = urllib.request.Request(webpage, headers=self.header)
        # Close the connection deterministically instead of relying on GC.
        with urllib.request.urlopen(req) as response:
            html = response.read()
            charset = response.headers.get_content_charset() or "utf-8"

        # Pages are not guaranteed to be valid UTF-8; honour the declared
        # charset and never abort the save over a few undecodable bytes.
        try:
            text = html.decode(charset, errors="replace")
        except LookupError:  # server declared a charset Python does not know
            text = html.decode("utf-8", errors="replace")

        filename = self.filename_to_write(webpage)
        # Explicit encoding so the write cannot fail on non-UTF-8 locales.
        with open(filename, "w", encoding="utf-8") as out:
            out.write(text)
        # os.path.join picks the right separator on every platform.
        print("Webpage saved to: " + os.path.join(os.getcwd(), filename))

    def filename_to_write(self, webpage):
        """Return the output filename for *webpage*.

        The name is the URL's network location (host and optional port) with
        every non-alphanumeric character removed, plus a ``.txt`` suffix.
        """
        netloc = urlparse(webpage)[1]
        return re.sub(r'[^a-zA-Z0-9]', '', netloc) + ".txt"


class load_site_list():
    """Load the list of website addresses to download."""

    def __init__(self):
        # None until get_website_list() has been called.
        self.websitelist = None

    def get_website_list(self, filename=None):
        """Read website addresses (one per line) into ``self.websitelist``.

        Args:
            filename: path of the address file. When ``None`` (the default)
                the user is prompted for it on stdin.

        Blank lines are skipped and surrounding whitespace (including stray
        carriage returns) is stripped. If the file does not exist, an error
        message is printed and ``self.websitelist`` is set to an empty list.
        """
        if filename is None:
            filename = input("Enter filename of website addresses: ")
        try:
            with open(filename, "r") as handle:
                self.websitelist = [
                    line.strip() for line in handle if line.strip()
                ]
        except FileNotFoundError:
            print("*** file not found ***")
            # Keep the attribute iterable so callers can loop unconditionally.
            self.websitelist = []


def _download(req, site):
    """Thread target: fetch *site* via *req*, reporting failures per site."""
    try:
        req.make_request(site)
    except (OSError, ValueError) as err:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # malformed URLs. Report and let the other downloads continue.
        print("Error downloading " + site + ": " + str(err))


def main():
    """Prompt for the address file and download every listed site."""
    f = load_site_list()
    f.get_website_list()
    req = get_webpage(
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:48.0) "
        "Gecko/20100101 Firefox/48.0"
    )

    threads = []
    for site in f.websitelist or []:
        print("Starting thread to download: " + site)
        try:
            t = Thread(target=_download, args=(req, site))
            t.start()
        except RuntimeError as err:
            print("Error creating threads: " + str(err))
        else:
            threads.append(t)

    # Join only after every thread has been started; joining inside the loop
    # above would force the downloads to run one at a time.
    for t in threads:
        t.join()


if __name__ == '__main__':
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    print("Time taken: {:.6f} seconds".format(end - start))
# Copyright © 2023 - slash-root.com