A tiny dependency-free CLI that finds and checks links in Markdown/text files.
# linkrot-lantern 🕯️

A tiny CLI that finds links in Markdown/text files and checks whether they still answer.

No dependencies. No account. No drama.

Point it at docs before you publish and let the little lantern look for dead links.

## Install

```bash
git clone https://github.com/bricktheceo/linkrot-lantern.git
cd linkrot-lantern
python3 src/linkrot_lantern.py README.md
```

## Usage

```bash
# Check one or more files
python3 src/linkrot_lantern.py README.md docs/*.md

# Read from stdin
cat README.md | python3 src/linkrot_lantern.py -

# JSON output
python3 src/linkrot_lantern.py --json README.md

# Include localhost/private links too
python3 src/linkrot_lantern.py --include-private README.md
```

## What counts as private?

By default it skips localhost, private IPs, `.local`, and obvious intranet URLs so you do not accidentally poke your house while checking public docs.

## Exit codes

- `0` all checked links looked alive or were skipped
- `1` one or more links failed
- `2` usage/input problem

## License

MIT
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import concurrent.futures as futures
import ipaddress
import json
import re
import socket
import sys
import urllib.error
import urllib.request
from dataclasses import asdict, dataclass
from pathlib import Path
from urllib.parse import urlparse
# Matches http(s) URLs embedded in free text. A match runs until whitespace or
# one of the characters that normally *delimits* a link in Markdown, HTML or
# JSON instead of belonging to it: ) ] > < ' " } and the backtick.
# "<" and "`" are not valid unescaped URL characters, so stopping at them keeps
# inline code (`https://example.com`) and HTML (https://example.com</a>) from
# leaking markup into the extracted link.
URL_RE = re.compile(r"https?://[^\s)\]><'\"}`]+")
@dataclass
class LinkResult:
    """Outcome of examining a single URL found in a single source.

    Instances are plain records: they are built by the checker, sorted by
    ``(source, url)`` for display, and converted with ``asdict`` for JSON.
    """

    # The URL exactly as it was extracted from the text.
    url: str
    # Label of the input it came from (a path as given, or "<stdin>").
    source: str
    # Short verdict string, e.g. "ok", "fail", "skipped" or "bad-url".
    status: str
    # HTTP status code when a response (or HTTP error) was received.
    code: int | None = None
    # Optional human-readable detail explaining the status.
    reason: str = ""
def is_private_host(host: str) -> bool:
    """Return True when *host* points at a local or private-network machine.

    The check is layered:

    1. Well-known names: ``localhost``, ``0.0.0.0``, anything under ``.local``
       or ``.localhost``.
    2. IP literals (IPv4 or IPv6, optionally bracketed or carrying a zone id)
       that are private, loopback or link-local.
    3. Any other name is resolved via DNS and is private if *any* of the
       returned addresses is private, loopback or link-local.

    Args:
        host: Host name or IP literal, e.g. the ``hostname`` of a parsed URL.

    Returns:
        True if the host should be treated as private/local. A name that
        cannot be resolved is reported as not private (False).
    """

    def _is_internal(address: str) -> bool:
        # Drop an IPv6 zone id ("fe80::1%eth0"): it is irrelevant to the
        # classification and not every Python version can parse it.
        ip = ipaddress.ip_address(address.split("%", 1)[0])
        return ip.is_private or ip.is_loopback or ip.is_link_local

    # Normalise: case-insensitive, IPv6 brackets removed, and the trailing dot
    # of a fully-qualified name dropped so "localhost." matches "localhost".
    h = host.lower().strip("[]").rstrip(".")
    if h in {"localhost", "0.0.0.0"} or h.endswith((".local", ".localhost")):
        return True

    try:
        return _is_internal(h)
    except ValueError:
        pass  # Not an IP literal; fall through to name resolution.

    try:
        infos = socket.getaddrinfo(h, None, proto=socket.IPPROTO_TCP)
    except (OSError, ValueError):
        # Resolution failures (socket.gaierror is an OSError) and names that
        # cannot be encoded (UnicodeError is a ValueError) are "not private";
        # the link check itself will report the failure.
        return False

    # Inspect every resolved address: a single private record is enough.
    for info in infos:
        try:
            if _is_internal(info[4][0]):
                return True
        except ValueError:
            # One odd address must not hide a private one later in the list.
            continue
    return False
def extract_links(text: str) -> list[str]:
    """Return the distinct URLs found in *text*, in order of first appearance.

    Every ``URL_RE`` match has trailing sentence punctuation and closing
    brackets (``. , ; : ! ? ) ]``) trimmed before duplicates are removed.
    """
    trimmed = (hit.rstrip(".,;:!?)]") for hit in URL_RE.findall(text))
    # dict keys keep insertion order, so this drops repeats while preserving
    # the position of each URL's first occurrence.
    return list(dict.fromkeys(trimmed))
def read_sources(paths: list[str]) -> list[tuple[str, str]]:
    """Load the text of every input to scan.

    Args:
        paths: File paths to read. ``"-"`` means standard input; an empty list
            means "read standard input only".

    Returns:
        A list of ``(label, text)`` pairs in the order given, where *label* is
        the path as supplied or ``"<stdin>"``. Files are decoded as UTF-8 with
        undecodable bytes replaced.

    Raises:
        SystemExit: with exit status 2 (input problem) when a path is missing
            or cannot be read; the explanation is written to stderr.
    """
    if not paths:
        return [("<stdin>", sys.stdin.read())]

    sources = []
    for raw in paths:
        if raw == "-":
            sources.append(("<stdin>", sys.stdin.read()))
            continue
        # Read directly and handle the failure rather than testing exists()
        # first: that also covers directories and permission errors, and
        # avoids a race between the check and the read.
        try:
            text = Path(raw).read_text(encoding="utf-8", errors="replace")
        except FileNotFoundError:
            print(f"missing file: {raw}", file=sys.stderr)
            # Status 2 marks an input problem; status 1 is reserved for
            # "one or more links failed".
            raise SystemExit(2) from None
        except OSError as err:
            print(f"cannot read {raw}: {err.strerror or err}", file=sys.stderr)
            raise SystemExit(2) from None
        sources.append((raw, text))
    return sources
def check(url: str, source: str, timeout: float, include_private: bool) -> LinkResult:
    """Probe one URL and describe the outcome.

    Args:
        url: The link to test.
        source: Label of the input the link came from; copied into the result.
        timeout: Per-request timeout in seconds.
        include_private: When False, hosts judged private/local are not
            contacted and are reported as ``"skipped"``.

    Returns:
        A ``LinkResult`` whose status is ``"bad-url"`` (unparsable URL or no
        host), ``"skipped"`` (private host), ``"ok"`` or ``"fail"``.
    """
    # urlparse raises ValueError for malformed authorities such as an
    # unbalanced IPv6 bracket ("http://[::1"). This function runs inside a
    # worker whose exception would abort the whole run, so report it instead.
    try:
        hostname = urlparse(url).hostname
    except ValueError as err:
        return LinkResult(url, source, "bad-url", reason=str(err))
    if not hostname:
        return LinkResult(url, source, "bad-url", reason="missing hostname")
    if not include_private and is_private_host(hostname):
        return LinkResult(url, source, "skipped", reason="private/local host")

    headers = {"User-Agent": "linkrot-lantern/0.1 (+https://github.com/bricktheceo/linkrot-lantern)"}
    # Try the cheap HEAD first; fall back to GET when HEAD is inconclusive.
    for method in ("HEAD", "GET"):
        req = urllib.request.Request(url, method=method, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                code = int(resp.status)
                status = "ok" if code < 400 else "fail"
                return LinkResult(url, source, status, code=code)
        except urllib.error.HTTPError as e:
            # Some servers hate HEAD; retry GET before judging.
            if method == "HEAD" and e.code in {403, 405, 429}:
                continue
            return LinkResult(url, source, "fail", code=e.code, reason=str(e.reason))
        except Exception as e:
            # Deliberately broad: any transport-level problem (DNS, TLS,
            # timeout, ...) is a link failure, never a crash. A HEAD failure
            # still gets one GET attempt before the verdict.
            if method == "HEAD":
                continue
            return LinkResult(url, source, "fail", reason=e.__class__.__name__)
    # Defensive fallback; the GET iteration above always returns.
    return LinkResult(url, source, "fail", reason="unknown")
def main(argv: list[str]) -> int:
    """Run the command-line interface.

    Args:
        argv: Full argument vector; ``argv[0]`` (the program name) is ignored.

    Returns:
        1 if at least one link has status ``"fail"``, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Find and check links in Markdown/text files.")
    parser.add_argument("paths", nargs="*", help="Files to scan, or - for stdin")
    parser.add_argument("--json", action="store_true", help="Emit JSON instead of a table")
    parser.add_argument("--include-private", action="store_true", help="Do not skip localhost/private hosts")
    parser.add_argument("--timeout", type=float, default=8.0, help="Per-request timeout in seconds")
    parser.add_argument("--workers", type=int, default=8, help="Concurrent link checks")
    opts = parser.parse_args(argv[1:])

    # One (url, source) job per distinct link in each input.
    jobs: list[tuple[str, str]] = [
        (url, source)
        for source, text in read_sources(opts.paths)
        for url in extract_links(text)
    ]

    # Fan the checks out over a thread pool and collect them as they finish.
    worker_count = max(1, opts.workers)
    with futures.ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [
            pool.submit(check, url, source, opts.timeout, opts.include_private)
            for url, source in jobs
        ]
        results: list[LinkResult] = [done.result() for done in futures.as_completed(pending)]

    # Completion order is arbitrary; sort for stable, readable output.
    results.sort(key=lambda r: (r.source, r.url))

    if opts.json:
        print(json.dumps([asdict(r) for r in results], indent=2))
    elif not results:
        print("No links found.")
    else:
        for r in results:
            code_part = f" {r.code}" if r.code is not None else ""
            reason_part = f" — {r.reason}" if r.reason else ""
            print(f"{r.status.upper():7} {r.source}: {r.url}{code_part}{reason_part}")

    any_failed = any(r.status == "fail" for r in results)
    return 1 if any_failed else 0
# Script entry point: run the CLI and use its return value as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))