<#
.SYNOPSIS
    Crawls a website and writes an XML sitemap of every same-host page it could fetch.

.DESCRIPTION
    Starting at BaseUrl, pages are downloaded breadth-first up to MaxDepth link
    hops away. Anchor href values are resolved against the page they were found
    on; only http/https URLs whose host equals the base host are followed.
    Every URL that was fetched successfully is written to OutputPath in the
    sitemaps.org 0.9 format, with the generation time as <lastmod>.

.PARAMETER BaseUrl
    Site to crawl. "https://" is prepended when no http:// or https:// scheme is given.

.PARAMETER OutputPath
    File the sitemap is written to. Relative paths are resolved against the
    current PowerShell location.

.PARAMETER MaxDepth
    Maximum number of link hops from BaseUrl that will be fetched (BaseUrl itself is depth 0).

.PARAMETER RequestTimeout
    Per-request timeout in seconds, passed to Invoke-WebRequest -TimeoutSec.
#>
Param(
    [Parameter(Mandatory=$true)][string]$BaseUrl,
    [Parameter(Mandatory=$false)][string]$OutputPath = "sitemap.xml",
    [int]$MaxDepth = 3,
    [int]$RequestTimeout = 30
)

# Normalize base URL. Test for a real scheme prefix (case-insensitively) so that
# hosts such as "httpbin.org" or input like "HTTP://..." are handled correctly.
if ($BaseUrl -notmatch '^https?://') {
    $BaseUrl = "https://$BaseUrl"
}
$baseUri = [System.Uri]::new($BaseUrl)

# Set up collections
$visited = New-Object System.Collections.Generic.HashSet[string] ([System.StringComparer]::OrdinalIgnoreCase)
$foundUrls = New-Object System.Collections.Generic.List[string]

# Resolves an href (absolute or relative) against the page it was found on.
# Returns a System.Uri, or $null when the href cannot be turned into a URI.
function Get-AbsoluteUrl {
    param(
        [string]$href,
        [System.Uri]$parentUri
    )

    try {
        if ([System.Uri]::IsWellFormedUriString($href, [System.UriKind]::Absolute)) {
            return [System.Uri]::new($href)
        }
        return New-Object System.Uri($parentUri, $href)
    }
    catch {
        return $null
    }
}

# Returns the same-host http/https URLs referenced by <a href=...> tags in $html.
# Fragments are dropped because they address a position inside a page, not a
# separate page.
function Extract-Links {
    param(
        [string]$html,
        [System.Uri]$parentUri
    )

    $links = New-Object System.Collections.Generic.List[string]

    # Use regex to pull href values from <a> tags (quoted or unquoted values).
    $hrefPattern = '(?i)<a\s+(?:[^>]*?\s+)?href\s*=\s*["'']?([^"''\s>]+)'

    foreach ($match in [regex]::Matches($html, $hrefPattern)) {
        # Attribute values are HTML-encoded in markup (e.g. "&amp;" for "&").
        $rawHref = [System.Net.WebUtility]::HtmlDecode($match.Groups[1].Value)
        $absUri = Get-AbsoluteUrl -href $rawHref -parentUri $parentUri

        if ($null -eq $absUri -or $absUri.Scheme -notmatch '^https?$') {
            continue
        }

        # Only keep URLs that belong to the same host
        if ($absUri.Host -ine $baseUri.Host) {
            continue
        }

        # Scheme + authority + path + query, i.e. the URL without its fragment.
        $links.Add($absUri.GetLeftPart([System.UriPartial]::Query))
    }

    return $links
}

# Crawls breadth-first from $url (treated as being at $depth), recording every
# successfully fetched URL in $foundUrls. Breadth-first order guarantees a page
# is first visited at its shallowest depth, so the MaxDepth cut-off cannot hide
# pages that are reachable within the limit.
function Crawl {
    param(
        [string]$url,
        [int]$depth
    )

    $queue = New-Object System.Collections.Generic.Queue[object]
    $queue.Enqueue([pscustomobject]@{ Url = $url; Depth = $depth })

    while ($queue.Count -gt 0) {
        $item = $queue.Dequeue()

        if ($item.Depth -gt $MaxDepth) { continue }
        if (-not $visited.Add($item.Url)) { continue }   # already visited

        try {
            # -UseBasicParsing avoids the Internet Explorer dependency on Windows PowerShell 5.1.
            $response = Invoke-WebRequest -Uri $item.Url -Method GET -TimeoutSec $RequestTimeout -UseBasicParsing -ErrorAction Stop
        }
        catch {
            # Best effort: report the failure (404, 500, timeout, etc.) and keep crawling.
            Write-Warning "Skipping $($item.Url): $($_.Exception.Message)"
            continue
        }

        $foundUrls.Add($item.Url)

        # Binary responses expose Content as bytes; there are no links to extract.
        if ($response.Content -isnot [string]) { continue }

        # No point queueing children that would exceed the depth limit.
        if ($item.Depth -ge $MaxDepth) { continue }

        $pageLinks = @(Extract-Links -html $response.Content -parentUri ([System.Uri]::new($item.Url)))
        foreach ($link in $pageLinks) {
            if (-not $visited.Contains($link)) {
                $queue.Enqueue([pscustomobject]@{ Url = $link; Depth = $item.Depth + 1 })
            }
        }
    }
}

# Start crawling from the canonical form of the base URL so that
# "https://host" and "https://host/" are treated as the same page.
Crawl -url $baseUri.GetLeftPart([System.UriPartial]::Query) -depth 0

# Remove duplicates and sort; @() keeps the result an array for 0 or 1 URLs.
$uniqueUrls = @($foundUrls | Sort-Object -Unique)

# .NET resolves relative paths against the process working directory, which can
# differ from the PowerShell location, so resolve the path explicitly.
$resolvedOutputPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputPath)

# Build XML sitemap
$sitemapNs = "http://www.sitemaps.org/schemas/sitemap/0.9"
$xmlSettings = New-Object System.Xml.XmlWriterSettings
$xmlSettings.Indent = $true
$xmlSettings.Encoding = [System.Text.Encoding]::UTF8

$writer = [System.Xml.XmlWriter]::Create($resolvedOutputPath, $xmlSettings)
try {
    $writer.WriteStartDocument()
    $writer.WriteStartElement("urlset", $sitemapNs)

    $now = Get-Date -Format "yyyy-MM-ddTHH:mm:ssK"
    foreach ($loc in $uniqueUrls) {
        $writer.WriteStartElement("url")
        $writer.WriteElementString("loc", $loc)
        $writer.WriteElementString("lastmod", $now)
        $writer.WriteEndElement() # url
    }

    $writer.WriteEndElement() # urlset
    $writer.WriteEndDocument()
    $writer.Flush()
}
finally {
    # Always release the file handle, even if writing failed part-way.
    $writer.Close()
}

Write-Host "Sitemap generated at $resolvedOutputPath with $($uniqueUrls.Count) URLs."