<#
.SYNOPSIS
    Crawls a website breadth-first from a start URL and writes an XML sitemap.

.DESCRIPTION
    Starting at BaseUrl, each page is fetched with Invoke-WebRequest. Every URL
    that is fetched successfully is recorded. For responses whose Content-Type
    header mentions text/html, href attribute values are extracted with a
    regular expression, resolved against the page URL, stripped of their
    fragment, restricted to http/https URLs on the same host as BaseUrl, and
    queued for crawling one level deeper. Finally, all recorded URLs are
    written as <url><loc> entries in a sitemaps.org 0.9 <urlset> document.

.PARAMETER BaseUrl
    Absolute URL where crawling starts. A trailing slash is appended when it
    is missing.

.PARAMETER MaxDepth
    Maximum link depth to fetch; the start URL is depth 0. Defaults to 3.

.PARAMETER OutputPath
    File the sitemap is written to. Relative paths are resolved against the
    current PowerShell location. Defaults to "sitemap.xml".
#>
param(
    [Parameter(Mandatory = $true)]
    [string]$BaseUrl,

    [int]$MaxDepth = 3,

    [string]$OutputPath = "sitemap.xml"
)

# The XDocument/XElement types used below live in System.Xml.Linq, which may
# not be loaded by default (e.g. in Windows PowerShell); loading it again is harmless.
Add-Type -AssemblyName System.Xml.Linq

# Normalize base URL (ensure trailing slash for relative resolution)
if (-not $BaseUrl.EndsWith('/')) {
    $BaseUrl = "$BaseUrl/"
}

# Parse the base URL once, up front: this validates it and gives us the host
# to compare against without re-parsing it for every discovered link.
$baseUri = $null
if (-not [System.Uri]::TryCreate($BaseUrl, [System.UriKind]::Absolute, [ref]$baseUri)) {
    Write-Error "BaseUrl '$BaseUrl' is not a valid absolute URL."
    return
}
$baseHost = $baseUri.Host

# $visited: URLs fetched successfully (these end up in the sitemap).
# $seen:    every URL ever enqueued, so nothing is queued or fetched twice,
#           including URLs whose request failed.
$visited = New-Object 'System.Collections.Generic.HashSet[string]'
$seen    = New-Object 'System.Collections.Generic.HashSet[string]'
$queue   = New-Object 'System.Collections.Generic.Queue[object]'

# Enqueue the start URL with depth 0. AbsoluteUri is the canonical form, so a
# later link back to the start page compares equal to it.
$startUrl = $baseUri.AbsoluteUri
$null = $seen.Add($startUrl)
$queue.Enqueue([pscustomobject]@{ Url = $startUrl; Depth = 0 })

# href="..." or href='...'. The fragment is deliberately allowed in the capture
# and removed after resolution; excluding '#' from the character class would
# make links such as "page.html#top" fail to match at all.
$hrefPattern = 'href\s*=\s*["'']([^"''>]+)["'']'

# Link targets that are not crawlable pages.
$skipPattern = '^(mailto:|javascript:|tel:|#)'

$allowedSchemes = @('http', 'https')

while ($queue.Count -gt 0) {
    $item  = $queue.Dequeue()
    $url   = $item.Url
    $depth = $item.Depth

    # Only the start URL can trip this (when MaxDepth is negative); deeper
    # entries are filtered before they are enqueued.
    if ($depth -gt $MaxDepth) {
        continue
    }

    try {
        # A GET (not HEAD) is required because the body is needed for link extraction.
        $response = Invoke-WebRequest -Uri $url -UseBasicParsing -Method Get -ErrorAction Stop
    }
    catch {
        # Best effort: unreachable pages (404, timeout, ...) are left out of the sitemap.
        Write-Verbose "Skipping '$url': $($_.Exception.Message)"
        continue
    }

    $null = $visited.Add($url)

    # Links found on this page would sit at depth + 1; skip parsing when that
    # is already beyond the limit.
    if ($depth -ge $MaxDepth) {
        continue
    }

    # Read the content type from the response headers. Depending on the
    # PowerShell edition the header value is a string or a string collection;
    # joining handles both.
    $contentType = @($response.Headers['Content-Type']) -join ';'
    if (-not $response.Content -or $contentType -notmatch 'text/html') {
        continue
    }

    $pageUri = [uri]$url

    foreach ($match in [regex]::Matches($response.Content, $hrefPattern, 'IgnoreCase')) {
        # Attribute values are HTML-encoded in the markup (e.g. "&amp;" in query strings).
        $href = [System.Net.WebUtility]::HtmlDecode($match.Groups[1].Value).Trim()

        if ([string]::IsNullOrWhiteSpace($href) -or $href -match $skipPattern) {
            continue
        }

        # Resolve relative URLs against the current page. The base is passed as
        # a [uri] so the (Uri, string) constructor is selected unambiguously.
        try {
            $resolvedUri = New-Object -TypeName System.Uri -ArgumentList $pageUri, $href
        }
        catch {
            continue
        }

        # Stay on http(s) and within the original host.
        if ($allowedSchemes -notcontains $resolvedUri.Scheme) {
            continue
        }
        if ($resolvedUri.Host -ne $baseHost) {
            continue
        }

        # Drop the fragment: "page#a" and "page#b" are the same document.
        $resolved = $resolvedUri.GetLeftPart([System.UriPartial]::Query)

        # HashSet.Add returns $false for URLs that were already queued.
        if ($seen.Add($resolved)) {
            $queue.Enqueue([pscustomobject]@{ Url = $resolved; Depth = $depth + 1 })
        }
    }
}

if ($visited.Count -eq 0) {
    Write-Warning "No URLs could be fetched; the generated sitemap will be empty."
}

# Build the sitemap XML
$ns     = "http://www.sitemaps.org/schemas/sitemap/0.9"
$xmlDoc = New-Object System.Xml.Linq.XDocument
$urlset = New-Object System.Xml.Linq.XElement ([System.Xml.Linq.XName]::Get("urlset", $ns))

foreach ($loc in $visited) {
    $urlElement = New-Object System.Xml.Linq.XElement ([System.Xml.Linq.XName]::Get("url", $ns))
    $locElement = New-Object System.Xml.Linq.XElement ([System.Xml.Linq.XName]::Get("loc", $ns), $loc)
    $urlElement.Add($locElement)
    $urlset.Add($urlElement)
}

$xmlDoc.Add($urlset)

# Save to file. .NET resolves relative paths against the process working
# directory, which can differ from the current PowerShell location, so the
# path is resolved through the session state first.
try {
    $resolvedOutputPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputPath)
    $xmlDoc.Save($resolvedOutputPath)
    Write-Host "Sitemap generated at '$OutputPath' with $($visited.Count) URLs."
}
catch {
    Write-Error "Failed to write sitemap file: $_"
}