Introduction

On a whim, I decided to put together a script that crawls all the link information of a website, to help with SEO optimization.
Prerequisites

Install Python and make sure pip is up to date.
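The script itself only needs two third-party packages, requests and beautifulsoup4 (imported as bs4); everything else comes from the standard library. Assuming pip is already available on your PATH, a typical install looks like:

python -m pip install --upgrade pip
python -m pip install requests beautifulsoup4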
Code

The script below starts from the target domain, follows only links that stay on the same main domain, respects robots.txt, waits between requests, and appends each page's status code, IP address, content type, and response time to crawl_results.csv. Replace 'https://a.com' with the site you actually want to crawl.
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse
import time
import socket
from urllib.robotparser import RobotFileParser
# Target domain (replace with the site you want to crawl)
target_domain = 'https://a.com'
# Extract the main (registrable) domain, e.g. 'a.com'
parsed_target = urlparse(target_domain)
target_main_domain = '.'.join(parsed_target.netloc.split('.')[-2:])
# Pages that have already been visited
visited_pages = set()
# Queue of (url, source_page) pairs still to be crawled
pages_to_visit = []
# Delay between requests, in milliseconds
crawl_delay = 1000
# Number of pages crawled so far
crawled_count = 0
# Initialize the robots.txt parser for the target site
rp = RobotFileParser()
rp.set_url(urljoin(target_domain, '/robots.txt'))
rp.read()

def is_valid_url(url):
    # A link is only usable if it has both a scheme and a network location
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def is_same_main_domain(url):
    # Keep the crawl on the target site by comparing main domains
    parsed = urlparse(url)
    main_domain = '.'.join(parsed.netloc.split('.')[-2:])
    return main_domain == target_main_domain

def get_initial_links():
    # Seed the queue with the links found on the target homepage
    global pages_to_visit
    try:
        if rp.can_fetch('*', target_domain):
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(target_domain, headers=headers, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if href:
                        full_url = urljoin(target_domain, href)
                        if is_valid_url(full_url) and is_same_main_domain(full_url) and rp.can_fetch('*', full_url):
                            pages_to_visit.append((full_url, target_domain))
    except Exception as e:
        print(f"Error getting initial links from {target_domain}: {e}")

def crawl_page(url, source_page):
    # Fetch one page, record its details in the CSV, and queue new same-domain links
    global visited_pages, pages_to_visit, crawled_count
    if url in visited_pages:
        return
    visited_pages.add(url)
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        start_time = time.time()
        response = requests.get(url, headers=headers, timeout=10)
        end_time = time.time()
        status_code = response.status_code
        status_reason = response.reason
        content_type = response.headers.get('Content-Type', '')
        response_time = int((end_time - start_time) * 1000)
        try:
            ip = socket.gethostbyname(urlparse(url).netloc)
        except socket.gaierror:
            ip = 'Unknown'
        # Append the result row to the CSV file
        with open('crawl_results.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([url, source_page, status_code, status_reason, ip, content_type, response_time])
        crawled_count += 1
        print(f"Crawled {crawled_count} pages: {url}")
        if status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                if href:
                    full_url = urljoin(url, href)
                    if is_valid_url(full_url) and is_same_main_domain(full_url) and rp.can_fetch('*', full_url) and full_url not in visited_pages:
                        pages_to_visit.append((full_url, url))
        time.sleep(crawl_delay / 1000)
    except Exception as e:
        print(f"Error crawling {url}: {e}")

# Seed the queue with the initial links to crawl
get_initial_links()
# Initialize the CSV file with a header row
with open('crawl_results.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Page URL', 'Source Page', 'Status Code', 'Status Reason', 'IP Address', 'Content Type', 'Response Time (ms)'])
# Work through the queue; each entry is a (url, source_page) pair
while pages_to_visit:
    current_page, source_page = pages_to_visit.pop(0)
    crawl_page(current_page, source_page)
print("Crawling completed. Results saved to crawl_results.csv")