Introduction

On a whim, I decided to write a small script that crawls every link on a website and records basic information about each one, as a help for SEO work.

Prerequisites

Install Python 3 and update pip

Install Python 3 from python.org or your system's package manager, then make sure pip itself is up to date:

python3 -m pip install --upgrade pip
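
The script below also relies on two third-party packages, requests and beautifulsoup4 (imported as bs4). They are not part of the standard library, so install them with pip as well:

python3 -m pip install requests beautifulsoup4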

Code

import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse
import time
import socket
from urllib.robotparser import RobotFileParser

# Target domain (replace with the site you want to crawl)
target_domain = 'https://a.com'
# Extract the registrable (second-level) domain
parsed_target = urlparse(target_domain)
target_main_domain = '.'.join(parsed_target.netloc.split('.')[-2:])

# Pages that have already been visited
visited_pages = set()
# Pages waiting to be crawled
pages_to_visit = []
# Crawl delay (milliseconds)
crawl_delay = 1000
# Number of pages crawled so far
crawled_count = 0
# Initialize the RobotFileParser with the site's robots.txt
rp = RobotFileParser()
rp.set_url(urljoin(target_domain, '/robots.txt'))
rp.read()
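# Optional tweak, not part of the original script: robots.txt may declare its
# own Crawl-delay, which RobotFileParser exposes via crawl_delay(); the fixed
# 1000 ms delay above could be overridden with it, e.g.:
# site_delay = rp.crawl_delay('*')
# if site_delay:
#     crawl_delay = int(site_delay * 1000)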


# A link is treated as valid only if it has both a scheme and a network location
def is_valid_url(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


# Check whether a URL belongs to the same registrable domain as the target
def is_same_main_domain(url):
    parsed = urlparse(url)
    main_domain = '.'.join(parsed.netloc.split('.')[-2:])
    return main_domain == target_main_domain
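# Note, not part of the original script: keeping only the last two dot-separated
# labels misclassifies multi-part public suffixes such as example.co.uk; if that
# matters for your site, the third-party tldextract package can be swapped in,
# e.g. tldextract.extract(url).registered_domain.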


# Seed the crawl queue with the links found on the target domain's start page
def get_initial_links():
    global pages_to_visit
    try:
        if rp.can_fetch('*', target_domain):
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(target_domain, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if href:
                        full_url = urljoin(target_domain, href)
                        if is_valid_url(full_url) and is_same_main_domain(full_url) and rp.can_fetch('*', full_url):
                            pages_to_visit.append(full_url)
    except Exception as e:
        print(f"Error getting initial links from {target_domain}: {e}")


# Fetch one page, append its details to the CSV and queue any new same-domain links
def crawl_page(url, source_page):
    global visited_pages, pages_to_visit, crawled_count
    if url in visited_pages:
        return
    visited_pages.add(url)
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        start_time = time.time()
        response = requests.get(url, headers=headers)
        end_time = time.time()
        status_code = response.status_code
        status_reason = response.reason
        content_type = response.headers.get('Content-Type', '')
        response_time = int((end_time - start_time) * 1000)
        try:
            ip = socket.gethostbyname(urlparse(url).netloc)
        except socket.gaierror:
            ip = 'Unknown'
        # Append this page's results to the CSV file
        with open('crawl_results.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([url, source_page, status_code, status_reason, ip, content_type, response_time])
        crawled_count += 1
        print(f"已爬取 {crawled_count} 个页面: {url}")
        if status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                if href:
                    full_url = urljoin(url, href)
                    if is_valid_url(full_url) and is_same_main_domain(full_url) and rp.can_fetch('*', full_url) and full_url not in visited_pages:
                        pages_to_visit.append(full_url)
        time.sleep(crawl_delay / 1000)
    except Exception as e:
        print(f"Error crawling {url}: {e}")


# Collect the initial set of links to crawl
get_initial_links()

# Create the CSV file and write the header row
with open('crawl_results.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Page URL', 'Source Page', 'Status Code', 'Status Reason', 'IP Address', 'Content Type', 'Response Time (ms)'])

# Breadth-first crawl; the queue only stores URLs, so the "Source Page" column
# simply repeats the page URL except for the start page itself.
while pages_to_visit:
    current_page = pages_to_visit.pop(0)
    crawl_page(current_page, target_domain if current_page == target_domain else current_page)

print("Crawling completed. Results saved to crawl_results.csv")
