pythonのbeautifulsoup4を利用してサイト内にあるすべてのリンクを抽出してみる

はじめに

運用しているサイトにバックアップファイルが多く残っており、現時点でトップページから正しくリンクが張られているページを調査する必要がありました。手動でチェックするのはしんどいので、プログラムで調査できないかと考え、サンプルのプログラムを作成してみました。

利用したパッケージ

beautifulsoup4
requests

サンプルコード

from html.parser import HTMLParser
from bs4 import BeautifulSoup
import requests

# Root URL of the site to crawl (placeholder: replace with the real URL).
# searchLink concatenates it directly with page paths, e.g.
# f"{baseurl}index.php", so it is expected to end with "/".
baseurl = "<調査したいURL>"
# Candidate page URLs collected by searchLink from <a href> attributes.
urlList = []
# URLs that searchLink has already picked for fetching; printed at the end.
ok_urlList = []

def _site_url_from_href(href, base):
    """Turn one ``href`` value into a URL under ``base``, or return ``None``.

    Args:
        href: Value of an ``<a>`` tag's ``href`` attribute, or ``None`` when
            the tag has no such attribute.
        base: Prefix that the cleaned-up path is appended to.

    Returns:
        ``base`` followed by the cleaned-up path, or ``None`` when the link
        is filtered out.
    """
    # Anchors without an href cannot be followed.
    if href is None:
        return None
    # Skip links containing "http" (absolute URLs) and any index.html link.
    if "http" in href or "index.html" in href:
        return None
    # NOTE(review): dropping "../" and "./" treats every relative link as
    # relative to ``base`` rather than to the page it was found on; confirm
    # this matches how the target site writes its links.
    path = href.replace('../', '').replace('./', '').replace('//', '')
    # Only URLs that point at PHP or HTML files are collected.
    if ".php" not in path and ".html" not in path:
        return None
    # ``base`` is prepended as-is, so a leading slash would be doubled.
    if path.startswith("/"):
        path = path[1:]
    # A path ending in "/" is pointed at that directory's index.php.
    if path.endswith("/"):
        path += "index.php"
    return f"{base}{path}"


def searchLink(url, timeout=10):
    """Crawl pages reachable through ``<a>`` links and record their URLs.

    Starting from ``url``, every fetched page is parsed, the links accepted
    by ``_site_url_from_href`` are appended to the module-level ``urlList``
    (without duplicates, in discovery order), and each URL chosen for
    fetching is appended to the module-level ``ok_urlList``. Pages are
    processed in a loop rather than by recursion, so the number of pages is
    not limited by the interpreter's recursion depth.

    Args:
        url: URL of the first page to fetch. An empty string means
            ``f"{baseurl}index.php"``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the crawl forever.

    Returns:
        None. Results are left in ``urlList`` and ``ok_urlList``.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` when a request fails or times out.
    """
    # De-duplicate in place (keeping first-seen order) so the module-level
    # list object is mutated rather than rebound.
    urlList[:] = dict.fromkeys(urlList)
    known = set(urlList)
    checked = set(ok_urlList)
    # Index of the first entry in urlList that may still be unchecked.
    cursor = 0

    page_url = f"{baseurl}index.php" if url == "" else url
    while True:
        response = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        for anchor in soup.find_all("a"):
            candidate = _site_url_from_href(anchor.get('href'), baseurl)
            if candidate is not None and candidate not in known:
                known.add(candidate)
                urlList.append(candidate)

        # Skip URLs that were already checked.
        while cursor < len(urlList) and urlList[cursor] in checked:
            cursor += 1
        # Stop once there is nothing left to check.
        if cursor == len(urlList):
            return

        # Mark the next unchecked URL as checked, then fetch it.
        page_url = urlList[cursor]
        checked.add(page_url)
        ok_urlList.append(page_url)

# Start the crawl; the empty string makes searchLink fetch f"{baseurl}index.php".
searchLink("")
# Show the URLs that searchLink recorded in ok_urlList.
print(ok_urlList)