## Preface

On a whim, I decided to write a script that crawls every link on a website and records information about each one, to help with SEO work.

## Prerequisites

Install Python and update pip.
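The script also relies on two third-party libraries, `requests` and `beautifulsoup4`. A minimal setup sketch, assuming Python 3 and pip are already on your PATH:

```bash
# Upgrade pip, then install the HTTP client and HTML parser used by the crawler
python -m pip install --upgrade pip
python -m pip install requests beautifulsoup4
```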
## Code

The script below starts from `target_domain`, follows only in-site links allowed by robots.txt, and writes one CSV row per page with its source page, status code, IP address, content type, and response time. Replace the placeholder `https://a.com` with the site you want to audit before running.

```python
import requests
from bs4 import BeautifulSoup
import csv
import time
import socket
from collections import deque
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

# Target site (crawl seed)
target_domain = 'https://a.com'

# Extract the registrable "main" domain (e.g. a.com).
# Note: this simple split does not handle multi-part suffixes such as .co.uk.
parsed_target = urlparse(target_domain)
target_main_domain = '.'.join(parsed_target.netloc.split('.')[-2:])

# Pages already crawled
visited_pages = set()
# Queue of (url, source_page) pairs waiting to be crawled
pages_to_visit = deque()
# Delay between requests (milliseconds)
crawl_delay = 1000
# Number of pages crawled so far
crawled_count = 0

# Initialize the robots.txt parser (read() raises if robots.txt cannot be fetched)
rp = RobotFileParser()
rp.set_url(urljoin(target_domain, '/robots.txt'))
rp.read()


def is_valid_url(url):
    """A URL is crawlable only if it has both a scheme and a host."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def is_same_main_domain(url):
    """Keep the crawl inside the target's main domain (subdomains included)."""
    parsed = urlparse(url)
    main_domain = '.'.join(parsed.netloc.split('.')[-2:])
    return main_domain == target_main_domain


def get_initial_links():
    """Fetch the start page and queue every crawlable in-site link found on it."""
    try:
        if rp.can_fetch('*', target_domain):
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(target_domain, headers=headers, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if href:
                        full_url = urljoin(target_domain, href)
                        if is_valid_url(full_url) and is_same_main_domain(full_url) and rp.can_fetch('*', full_url):
                            # Record the start page as the source of these links
                            pages_to_visit.append((full_url, target_domain))
    except Exception as e:
        print(f"Error getting initial links from {target_domain}: {e}")


def crawl_page(url, source_page):
    global crawled_count
    if url in visited_pages:
        return
    visited_pages.add(url)
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        start_time = time.time()
        response = requests.get(url, headers=headers, timeout=10)
        end_time = time.time()

        status_code = response.status_code
        status_reason = response.reason
        content_type = response.headers.get('Content-Type', '')
        response_time = int((end_time - start_time) * 1000)

        try:
            ip = socket.gethostbyname(urlparse(url).netloc)
        except socket.gaierror:
            ip = 'Unknown'

        # Append one row per crawled page
        with open('crawl_results.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([url, source_page, status_code, status_reason, ip, content_type, response_time])

        crawled_count += 1
        print(f"Crawled {crawled_count} pages: {url}")

        # Only parse successful HTML responses for further links
        if status_code == 200 and 'text/html' in content_type:
            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                if href:
                    full_url = urljoin(url, href)
                    if (is_valid_url(full_url) and is_same_main_domain(full_url)
                            and rp.can_fetch('*', full_url) and full_url not in visited_pages):
                        # Remember which page the link was found on
                        pages_to_visit.append((full_url, url))

        time.sleep(crawl_delay / 1000)
    except Exception as e:
        print(f"Error crawling {url}: {e}")


# Collect the initial links to crawl
get_initial_links()

# Write the CSV header
with open('crawl_results.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Page URL', 'Source Page', 'Status Code', 'Status Reason',
                     'IP Address', 'Content Type', 'Response Time (ms)'])

# Breadth-first crawl until the queue is empty
while pages_to_visit:
    current_page, source_page = pages_to_visit.popleft()
    crawl_page(current_page, source_page)

print("Crawling completed. Results saved to crawl_results.csv")
```
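Once the crawl finishes, the CSV can be post-processed for typical SEO checks such as spotting broken links. A minimal follow-up sketch using only the standard library; it assumes the `crawl_results.csv` produced by the script above, with the column names written in its header:

```python
import csv
from collections import Counter

status_counts = Counter()
broken_links = []

with open('crawl_results.csv', newline='', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        status_counts[row['Status Code']] += 1
        # Treat 4xx/5xx responses as broken links worth fixing
        if row['Status Code'].startswith(('4', '5')):
            broken_links.append((row['Page URL'], row['Source Page'], row['Status Code']))

print("Status code distribution:", dict(status_counts))
print(f"{len(broken_links)} broken link(s) found:")
for url, source, status in broken_links:
    print(f"  {status}  {url}  (linked from {source})")
```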