discord-bot/get-names_anidb.ps1

173 lines
5.1 KiB
PowerShell
Raw Permalink Normal View History

#!/usr/bin/pwsh
<#
.SYNOPSIS
Script to crawl AniDB for character names
.PARAMETER includeMecha
If enabled, include mecha names in the list.
Default is not enabled.
.PARAMETER startpage
Page number to start from. Assumes the default number of rows per page.
Default is 0.
.PARAMETER output
Output filename.
Default is names_anidb.txt.
#>
# Script parameters (see the comment-based help above for details).
param(
    # Also collect entries whose type is 'Mecha'.
    [switch]$includeMecha = $false,
    # Page number the crawl starts from.
    [int]$startpage = 0,
    # File the collected names are written to.
    [string]$output = 'names_anidb.txt'
)
# disable extra noise for invoke-webrequest (uncomment to enable)
#$ProgressPreference = "SilentlyContinue"

# Crawl state used by the rest of the script.
$page = $startpage      # page that is fetched next
$total = 3971           # default page count, used when counting the pages fails
$result = @()           # collected names
$hasNextPage = $false   # whether the last fetched page had a "next" link

# get anonymous session; the resulting $Session is reused by Request-Page
Invoke-WebRequest "https://anidb.net" -SessionVariable 'Session' | Out-Null
Start-Sleep -Seconds 2
function Request-Page($num)
{
    <#
    .SYNOPSIS
        Downloads one page of the AniDB character list and returns its HTML.
    .PARAMETER num
        Page number placed into the "page" query parameter.
    .OUTPUTS
        The response body (Content) of the request as a string.
    .NOTES
        Uses the $Session web session from the enclosing scope and pauses for
        3 seconds after every request.
    #>

    # all template: https://anidb.net/character/?noalias=1&orderby.name=0.1&page=1&view=list
    # letter template: https://anidb.net/character/?char=a&noalias=1&orderby.name=0.1&page=1&view=list
    $url = "https://anidb.net/character/?noalias=1&orderby.name=0.1&page=$num&view=list"
    $response = Invoke-WebRequest $url -WebSession $Session
    # alternative (unused) way of extracting the character links from the parsed response:
    #$links = @($response.links | Where-Object { $_.href -match '/character/\d+$' } | Where-Object { $_.outerHTML -match '^<a [^>]+>[^<][^\n]+</a>$' })
    Start-Sleep -Seconds 3
    return $response.Content
}
function Get-TotalPageCount($start)
{
    <#
    .SYNOPSIS
        Finds the number of the last page of the character list.
    .DESCRIPTION
        Probes pages with Request-Page. A page that contains a "next" link lies
        before the last page, a "No results." page lies after it, and a page
        with a character table but no "next" link is the last page.
        The upper bound starts at $start + 5000 and is pushed out in steps of
        1000 while it still has a "next" link; the last page is then located by
        bisection between the two bounds.
    .PARAMETER start
        Page number used as the initial lower bound.
    .OUTPUTS
        The number of the last page. If no page between the bounds qualifies,
        the lower bound is returned.
    .NOTES
        Throws when a probed page matches none of the known markers (for
        example a rate-limit page), so the caller can fall back to a default
        instead of trusting a bogus count.
    #>
    $nextMarker = '>next</a></li>'
    $emptyMarker = '<div class="container">No results.'
    $tableMarker = '<table class="characterlist">'

    $min = $start
    $max = $start + 5000

    # Phase 1: move the upper bound out until it is no longer followed by another page.
    # Every probed page that has a "next" link becomes the new lower bound,
    # which keeps the bisection range (and the number of requests) small.
    while ($true)
    {
        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
        $html = Request-Page $max
        if (-not $html.Contains($nextMarker))
        {
            break
        }
        $min = $max
        $max += 1000
    }

    if (-not $html.Contains($emptyMarker))
    {
        if ($html.Contains($tableMarker))
        {
            # the upper bound happens to be the last page itself
            return $max
        }
        throw "Unexpected response for page $max while counting pages"
    }

    # Phase 2: bisect. The midpoint is always strictly between the bounds,
    # so every iteration shrinks the range and the loop is guaranteed to end.
    while (($max - $min) -gt 1)
    {
        $middle = $min + [int][Math]::Floor(($max - $min) / 2)
        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
        $html = Request-Page $middle
        if ($html.Contains($nextMarker))
        {
            $min = $middle
        }
        elseif ($html.Contains($emptyMarker))
        {
            $max = $middle
        }
        elseif ($html.Contains($tableMarker))
        {
            return $middle
        }
        else
        {
            throw "Unexpected response for page $middle while counting pages"
        }
    }

    # Nothing left between the bounds (e.g. $start is already the last page or beyond it).
    return $min
}
# Writes the header line followed by the sorted, de-duplicated names to $path.
function Save-NameList($names, $path)
{
    Write-Host 'Saving the results...'
    '# https://anidb.net/character' | Out-File -LiteralPath $path
    # Get-Unique is case-sensitive; Sort-Object -Unique would not be, so keep both stages.
    $names | Sort-Object | Get-Unique | Out-File -LiteralPath $path -Append
}

# Try to find out how many pages there are; keep the default $total on failure.
try
{
    Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
    $total = Get-TotalPageCount $startPage
}
catch
{
    Write-Warning "Failed to count total number of pages, using default [$total]"
}

# Collect names in a growable list: `+=` on an array copies the whole array on every append.
$result = [System.Collections.Generic.List[string]]::new()
$startTime = [DateTime]::UtcNow
$remainingSeconds = ($total - $page) * 1.7

# Main crawl loop: fetch a page, extract the names, go on while there is a "next" link.
do
{
    try
    {
        # $total is only an estimate. Keep it >= $page (and non-zero) and clamp the
        # values handed to Write-Progress: a percentage above 100 makes Write-Progress
        # throw, which would land in the catch block below and stop the crawl.
        $total = [Math]::Max($total, [Math]::Max($page, 1))
        $percent = [Math]::Min(100, $page * 100.0 / $total)
        $secondsLeft = [Math]::Max(0, $remainingSeconds)
        Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete $percent -SecondsRemaining $secondsLeft

        $html = Request-Page $page
        $hasNextPage = $html.Contains('>next</a></li>')
        $pos = $html.IndexOf('<table class="characterlist">')
        if ($pos -lt 1)
        {
            if ($html.Contains('Please unban me'))
            {
                # Rate limited: save what we have, wait for the user, then retry the same page.
                Save-NameList $result $output
                $null = Read-Host -Prompt 'Rate limited, plz unban...'
                # `continue` jumps to the loop condition; $hasNextPage was computed from
                # the ban page, so force it to $true or the crawl would end here.
                $hasNextPage = $true
                continue
            }

            # Neither a character table nor the known ban page: the markup has changed.
            Write-Host $html
            Write-Error "This script needs updating"
            exit -1
        }

        # Walk over the rows of the table; each row has a "Title" cell followed by a "Type" cell.
        while ($true)
        {
            $pos = $html.IndexOf('<td data-label="Title"', $pos)
            if ($pos -lt 0)
            {
                # no more rows on this page
                break
            }

            # the name is the text of the link inside the Title cell
            $pos = $html.IndexOf('<a href=', $pos)
            $pos = $html.IndexOf('>', $pos)
            $endPos = $html.IndexOf('</a></td>', $pos)
            # NOTE(review): as written this Replace swaps a space for a space (a no-op);
            # the first argument may originally have been a non-breaking space - confirm.
            $name = $html.Substring($pos + 1, $endPos - $pos - 1).Replace(' ', ' ').Trim()

            # the type is the text of the Type cell
            $pos = $html.IndexOf('<td data-label="Type"', $endPos)
            $pos = $html.IndexOf('>', $pos)
            $endPos = $html.IndexOf('</td>', $pos)
            $type = $html.Substring($pos + 1, $endPos - $pos - 1)

            if (($type -ieq 'Character') -or ($includeMecha -and ($type -ieq 'Mecha')))
            {
                # drop names shorter than two characters and names made of digits only
                if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
                {
                    Write-Host "Skipping $name"
                    continue
                }

                $result.Add($name)
            }
            else
            {
                Write-Host "Skipping $name ($type)"
            }
        }

        $page++

        # After 60 pages switch from the fixed per-page guess to the measured average.
        if (($page - $startPage) -gt 60)
        {
            $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
        }
        else
        {
            $remainingSeconds = ($total - $page) * 1.7
        }
    }
    catch
    {
        # Any failure (request or parsing) ends the crawl; what was collected so far is still saved below.
        Write-Host "Failed to fetch the page: $($_.Exception.Message)" -ForegroundColor Yellow
        $hasNextPage = $false
    }
} while ($hasNextPage)

Write-Host "Stopped on page $page"
Write-Progress -Activity "Downloading" -Completed
Save-NameList $result $output
Write-Host 'Done.'