Skip to content

Commit 0e633d9

Browse files
committed
Draft script to find diverging links
See Quansight-Labs/czi-scientific-python-mgmt#88. Incomplete — in particular we should handle relative and anchor links (those starting with `#` or `.`). Usage: $ python tools/divergent_links.py docs/_build/html
1 parent 15494ec commit 0e633d9

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

tools/divergent_links.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""This script help checking divergent links.
2+
3+
That is to say, links to the same page,
4+
that have different titles.
5+
"""
6+
7+
import os
8+
import sys
9+
from collections import defaultdict
10+
11+
from bs4 import BeautifulSoup
12+
13+
ignores = ["#", "next", "previous"]
14+
15+
16+
def find_html_files(folder_path):
    """Return the paths of all ``.html`` files found under *folder_path*.

    The folder is walked recursively; returned paths are joined with the
    walk root, so they are relative/absolute exactly as *folder_path* is.
    """
    html_files = []
    # `_dirs` is unused: os.walk already descends into subdirectories.
    for root, _dirs, files in os.walk(folder_path):
        html_files.extend(
            os.path.join(root, name) for name in files if name.endswith(".html")
        )
    return html_files
24+
25+
26+
class Checker:
    """Collect link text -> URL mappings and report divergent ones."""

    # Maps each link text to every (url, identifier) pair seen for it.
    links: dict[str, list]

    def __init__(self):
        self.links = defaultdict(list)

    def scan(self, html_content, identifier):
        """Record every non-anchor link found in *html_content*.

        *identifier* names the source of the HTML (e.g. a file path) so
        the duplicate report can point back at it.
        """
        soup = BeautifulSoup(html_content, "html.parser")
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            # In-page anchors are never divergent; skip them outright.
            if href.startswith("#"):
                continue
            text = anchor.text.strip().lower()
            # Skip boilerplate navigation labels, whether they make up
            # the whole link text or just its first line.
            if text in ignores or text.split("\n")[0] in ignores:
                continue
            self.links[text].append((href, identifier))

    def duplicates(self):
        """Print every link text that maps to two or more distinct URLs."""
        for text, occurrences in self.links.items():
            distinct_urls = {href for href, _ in occurrences}
            if len(distinct_urls) < 2:
                continue
            print(f"{text} has divergent url:")
            for href, page in occurrences:
                print(" ", href, "in", page)
62+
63+
64+
# Entry point: scan every HTML file under the folder given on the command
# line and report link texts that point at more than one URL.
#
# Fixes over the draft: removed the leftover debug `print(sys.argv)` and
# the dead inline example; guard against a missing argument instead of
# crashing with IndexError; read HTML as UTF-8 explicitly rather than
# with the locale default encoding.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <html-folder>")

c = Checker()
for path in find_html_files(sys.argv[1]):
    with open(path, encoding="utf-8") as f:
        c.scan(f.read(), path)

c.duplicates()

0 commit comments

Comments
 (0)