# Script parameters for the sitemap generator.
param(
    # Address of the site to crawl (required).
    [Parameter(Mandatory)]
    [string]$BaseUrl,

    # File the sitemap XML is written to.
    [Parameter()]
    [string]$OutputPath = "sitemap.xml",

    # Largest link depth that is still crawled; the start URL is crawled at depth 0.
    [int]$MaxDepth = 3,

    # Timeout in seconds, passed to Invoke-WebRequest -TimeoutSec for each request.
    [int]$RequestTimeout = 30
)
# Normalize the base URL. The test is anchored on "http://" / "https://"
# (-notmatch is case-insensitive), so a bare host name that merely begins with
# "http" (for example "httpbin.org") still receives the default scheme.
if ($BaseUrl -notmatch '^https?://') {
    $BaseUrl = "https://$BaseUrl"
}
$baseUri = [System.Uri]::new($BaseUrl)
# Continue with the canonical form produced by System.Uri (for instance with the
# trailing "/" on a bare host). Discovered links are recorded via AbsoluteUri as
# well, so the start page is not fetched and listed a second time.
$BaseUrl = $baseUri.AbsoluteUri
# Set up collections
# URLs that have already been attempted; compared case-insensitively.
$visited = New-Object System.Collections.Generic.HashSet[string] ([System.StringComparer]::OrdinalIgnoreCase)
# URLs whose fetch succeeded; these end up in the sitemap.
$foundUrls = New-Object System.Collections.Generic.List[string]
# Turns an href value into a System.Uri.
#   href      - link text as found in the page; absolute or relative.
#   parentUri - URI of the page that contains the link; base for relative hrefs.
# Returns the resulting System.Uri, or $null when building it throws.
function Get-AbsoluteUrl {
    param(
        [string]$href,
        [System.Uri]$parentUri
    )
    try {
        $isAbsolute = [System.Uri]::IsWellFormedUriString($href, [System.UriKind]::Absolute)
        if (-not $isAbsolute) {
            # Everything else is resolved against the containing page.
            return [System.Uri]::new($parentUri, $href)
        }
        return [System.Uri]::new($href)
    } catch {
        return $null
    }
}
# Collects the same-host links found in a page's HTML.
#   html      - markup of the page.
#   parentUri - URI the markup was fetched from; base for relative hrefs.
# Returns the absolute http/https URLs (fragment removed) whose host equals
# $baseUri.Host. Reads the script-level $baseUri and calls Get-AbsoluteUrl.
function Extract-Links {
    param(
        [string]$html,
        [System.Uri]$parentUri
    )
    # A List avoids re-allocating the whole array on every append.
    $links = New-Object System.Collections.Generic.List[string]
    # Capture the href value of <a ...> tags; the value may be double-quoted,
    # single-quoted or unquoted, and other attributes may precede href.
    $hrefPattern = '(?i)<a\s+(?:[^>]*?\s+)?href\s*=\s*["'']?([^"''\s>]+)'
    foreach ($match in [regex]::Matches($html, $hrefPattern)) {
        # Attribute values are HTML-encoded in markup (e.g. "&amp;" inside a
        # query string); decode before treating the value as a URL.
        $rawHref = [System.Net.WebUtility]::HtmlDecode($match.Groups[1].Value)
        $absUri = Get-AbsoluteUrl -href $rawHref -parentUri $parentUri
        if ($null -eq $absUri -or $absUri.Scheme -notmatch '^https?$') { continue }
        # Only keep URLs that belong to the same host
        if ($absUri.Host -ine $baseUri.Host) { continue }
        # Drop any "#fragment": it addresses a position inside the same
        # document, so keeping it would list one page under several URLs.
        $links.Add($absUri.GetLeftPart([System.UriPartial]::Query))
    }
    return $links.ToArray()
}
# Crawls breadth-first from $url and records every page that could be fetched.
#   url   - absolute URL to start from.
#   depth - link depth assigned to $url; pages deeper than $MaxDepth are skipped.
# Reads the script-level $MaxDepth and $RequestTimeout, and updates the
# script-level $visited and $foundUrls collections.
function Crawl {
    param(
        [string]$url,
        [int]$depth
    )
    # A queue instead of recursion: pages are handled in order of increasing
    # depth, so a page is always claimed in $visited at its smallest depth.
    # With depth-first recursion a page first reached at $MaxDepth through a
    # long path was marked visited and never expanded, even when a shorter
    # path to it existed.
    $pending = New-Object System.Collections.Generic.Queue[object]
    $pending.Enqueue([pscustomobject]@{ Url = $url; Depth = $depth })

    while ($pending.Count -gt 0) {
        $page = $pending.Dequeue()
        if ($page.Depth -gt $MaxDepth) { continue }
        if (-not $visited.Add($page.Url)) { continue } # already attempted

        try {
            # -UseBasicParsing avoids the Internet Explorer based parser of
            # Windows PowerShell 5.1; newer versions accept and ignore it.
            $response = Invoke-WebRequest -Uri $page.Url -Method GET -TimeoutSec $RequestTimeout -UseBasicParsing -ErrorAction Stop
            $foundUrls.Add($page.Url)

            # Children of a page at $MaxDepth would be skipped anyway, and
            # non-text responses (Content is a byte[]) contain no markup to scan.
            if ($page.Depth -ge $MaxDepth -or $response.Content -isnot [string]) { continue }

            $pageLinks = Extract-Links -html $response.Content -parentUri ([System.Uri]::new($page.Url))
            foreach ($link in $pageLinks) {
                if (-not $visited.Contains($link)) {
                    $pending.Enqueue([pscustomobject]@{ Url = $link; Depth = $page.Depth + 1 })
                }
            }
        } catch {
            # Best effort: a failing page (404, 500, timeout, ...) must not stop
            # the crawl, but the reason is reported instead of being discarded.
            Write-Warning "Skipping '$($page.Url)': $($_.Exception.Message)"
        }
    }
}
# Start crawling from base URL
Crawl -url $BaseUrl -depth 0
# Remove duplicates and sort. @() keeps the result an array even when the
# pipeline yields zero or one URL.
$uniqueUrls = @($foundUrls | Sort-Object -Unique)
# Build XML sitemap
$sitemapNs = "http://www.sitemaps.org/schemas/sitemap/0.9"
$xmlSettings = New-Object System.Xml.XmlWriterSettings
$xmlSettings.Indent = $true
$xmlSettings.Encoding = [System.Text.Encoding]::UTF8
# .NET resolves a relative path against the process working directory, which
# does not follow Set-Location. Resolve against the PowerShell location first
# so the file lands where the caller expects it.
$resolvedOutputPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputPath)
# Format with the invariant culture: under the current culture the ":" time
# separator and the calendar may differ, which would corrupt the timestamp.
$now = (Get-Date).ToString("yyyy-MM-ddTHH:mm:ssK", [System.Globalization.CultureInfo]::InvariantCulture)
$writer = [System.Xml.XmlWriter]::Create($resolvedOutputPath, $xmlSettings)
try {
    $writer.WriteStartDocument()
    $writer.WriteStartElement("urlset", $sitemapNs)
    foreach ($loc in $uniqueUrls) {
        $writer.WriteStartElement("url")
        $writer.WriteElementString("loc", $loc)
        $writer.WriteElementString("lastmod", $now)
        $writer.WriteEndElement() # url
    }
    $writer.WriteEndElement() # urlset
    $writer.WriteEndDocument()
    $writer.Flush()
} finally {
    # Release the file handle even when a write above throws.
    $writer.Dispose()
}
Write-Host "Sitemap generated at $OutputPath with $($uniqueUrls.Count) URLs."