Fork of https://github.com/oxigraph/oxigraph.git for the purpose of NextGraph project
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
2.0 KiB
57 lines
2.0 KiB
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from urllib.error import HTTPError
|
|
from urllib.parse import urlsplit, urlunsplit
|
|
from urllib.request import urlopen
|
|
|
|
# Patterns that locate links to W3C-hosted specifications inside scanned files.
# Group 1 of every pattern captures the complete URL.
# The dots of the host names are escaped so that only the literal hosts
# "w3c.github.io" and "www.w3.org" match (an unescaped "." matches any
# character and would also accept look-alike hosts).
LINK_REGEXES = {
    r"\[[^]]+]\((https?://(w3c\.github\.io|www\.w3\.org)/[^)]+)\)",  # Markdown
    r"<(https?://(w3c\.github\.io|www\.w3\.org)/[^>]+)>`_",  # reStructuredText
}
|
|
|
|
# Problem descriptions collected during the scan; being a set, identical
# messages are stored only once.
errors = set()
# Text of every specification page downloaded so far, keyed by its URL.
spec_cache = dict()
# Directory two levels above this script; it is searched recursively.
base_path = Path(__file__).parent.parent
|
|
|
|
# Walk every Markdown, Rust and reStructuredText file below ``base_path``,
# extract the links matched by ``LINK_REGEXES`` and validate each of them:
#   * the scheme must be https,
#   * no query string is allowed,
#   * the path must not end with ".html/",
#   * the target page must be downloadable,
#   * a fragment, when present, must exist as an ``id`` attribute in the page.
# Every problem found is recorded in ``errors``; downloaded pages are kept in
# ``spec_cache`` so that each specification is fetched only once.

# Base URLs whose download failed, mapped to the text of the HTTP error.
# Remembering them avoids re-requesting a broken URL for every link that
# points at it, and lets the fragment check be skipped for pages that were
# never retrieved (checking against an empty page would report a bogus
# "fragment does not exist" error).
failed_fetches = {}

for ext in ("md", "rs", "rst"):
    for file in base_path.rglob(f"*.{ext}"):
        # Decode explicitly as UTF-8 so the scan does not depend on the locale.
        content = file.read_text(encoding="utf-8")
        for link_regex in LINK_REGEXES:
            for m in re.finditer(link_regex, content):
                url = m.group(1)
                (scheme, host, path, query, fragment) = urlsplit(url)
                if scheme != "https":
                    errors.add(f"HTTP URL used by {url} in {file}")
                if query != "":
                    errors.add(f"URL query used by {url} in {file}")
                if path.endswith(".html/"):
                    errors.add(f".html/ used by {url} in {file}")

                # Cache key: https scheme, no trailing slash, query or fragment.
                base_url = urlunsplit(("https", host, path.rstrip("/"), "", ""))
                if base_url not in spec_cache and base_url not in failed_fetches:
                    try:
                        with urlopen(base_url) as response:
                            spec_cache[base_url] = response.read().decode()
                    except HTTPError as e:
                        # Keep only the message, not the exception object.
                        failed_fetches[base_url] = str(e)

                if base_url in failed_fetches:
                    fetch_error = failed_fetches[base_url]
                    errors.add(
                        f"Fetching {url} used in {file} return HTTP error: {fetch_error}"
                    )
                    # Nothing was downloaded, so the fragment cannot be checked.
                    continue

                if fragment == "":
                    continue

                # Escape the fragment: it is literal text, not a regex.
                anchor = re.escape(fragment)
                spec_content = spec_cache[base_url]
                if (
                    re.search(rf"[iI][dD]\s*=\s*['\"]{anchor}['\"]", spec_content)
                    is None
                ):
                    errors.add(
                        f"Fragment {fragment} of {url} used in {file} does not exist"
                    )
|
|
|
|
# Report the outcome: list every specification that was downloaded on stdout,
# then print the collected errors (sorted, one per line) on stderr and finish
# with exit status 1 when at least one problem was found.
print("Used specs:")
for url in sorted(spec_cache):
    print(url)

if errors:
    print()
    for error in sorted(errors):
        print(error, file=sys.stderr)
    # sys.exit is always available; the bare ``exit`` helper is only injected
    # by the ``site`` module and is missing e.g. under ``python -S``.
    sys.exit(1)
|
|
|