discord-bot/get-names_anilist.ps1

125 lines
3.3 KiB
PowerShell
Raw Permalink Normal View History

2021-03-09 18:04:57 +00:00
#!/usr/bin/pwsh
<#
.SYNOPSIS
Script to crawl AniList for character names
.PARAMETER startPage
Start page number.
Default is 0.
.PARAMETER output
Output filename.
Default is names_anilist.txt.
#>
param(
    # Page number the crawl starts from.
    [int]$startPage = 0,
    # Path of the text file the collected names are written to.
    [string]$output = 'names_anilist.txt'
)
# disable extra noise for invoke-webrequest
#$ProgressPreference = "SilentlyContinue"

# JSON request body; the literal '{0}' placeholder is replaced with the page number
# before each request. The query asks for 50 characters per page plus paging info.
$requestTemplate = '{ "query": "query { Page (page: {0}, perPage: 50) { characters { name { full } } pageInfo { hasNextPage lastPage } } }" }'

# Page that will be requested next.
$page = $startPage
# Fallback page count, used when the total cannot be determined at startup.
$total = 2565
# Names collected so far.
$result = @()
# Whether the most recently fetched page reported that another page follows.
$hasNextPage = $false
<#
.SYNOPSIS
Requests a single page of characters from the AniList GraphQL endpoint.
.PARAMETER num
Page number to substitute into the script-level $requestTemplate.
.OUTPUTS
The `data.Page` member of the response (per the query: `characters` and `pageInfo`).
Errors raised by Invoke-RestMethod are not handled here; callers are expected to catch them.
#>
function Request-Page($num)
{
    # The template contains a literal '{0}' placeholder for the page number.
    $json = $requestTemplate.Replace('{0}', "$num")
    $response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json" -TimeoutSec 30
    # Pause 1.5 s after every successful request (presumably to stay under the API rate limit).
    # -Milliseconds is used because -Seconds only accepts fractional values on PowerShell 6.2+.
    Start-Sleep -Milliseconds 1500
    return $response.data.Page
}
<#
.SYNOPSIS
Finds the number of the last page that still contains characters.
.DESCRIPTION
First probes forward (start + 3000, then in steps of 1000) until a page reports
that no further page follows, then bisects between the last page known to have a
successor and the first page known to be empty.
.PARAMETER start
Page number to start searching from.
.OUTPUTS
The page number the search converged on.
Errors from Request-Page propagate to the caller.
#>
function Get-TotalPageCount($start)
{
    $min = $start
    $max = $start + 3000

    # Phase 1: find an upper bound. Every probed page that still has a successor
    # becomes the new lower bound, so the bisection starts from a narrower range.
    while ($true)
    {
        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
        $response = Request-Page $max
        if (-not $response.pageInfo.hasNextPage)
        {
            break
        }
        $min = $max
        $max += 1000
    }
    # A non-empty page without a successor is the last page.
    if ($response.characters.Count -gt 0)
    {
        return $max
    }

    # Phase 2: bisect. $max is now an empty page, i.e. strictly past the last page.
    while ($min -lt $max)
    {
        $middle = $min + [int][Math]::Floor(($max - $min) / 2.0)
        $response = Request-Page $middle
        if ($response.pageInfo.hasNextPage)
        {
            # When the range is a single page wide, $middle equals $min and the
            # range cannot shrink any further; stop instead of looping forever.
            if ($middle -eq $min)
            {
                return $middle
            }
            $min = $middle
        }
        elseif ($response.characters.Count -eq 0)
        {
            $max = $middle
        }
        else
        {
            return $middle
        }
        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
    }
    return $middle
}
# Determine the real page count up front so the progress bar and the time estimate
# are meaningful. On any failure the default $total is kept.
try
{
    Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
    $counted = Get-TotalPageCount $startPage
    # A missing or non-positive count would later break the percentage calculation,
    # so it is treated as a failure as well.
    if ($counted -gt 0)
    {
        $total = $counted
    }
    else
    {
        throw "unexpected page count '$counted'"
    }
}
catch
{
    Write-Warning "Failed to count total number of pages, using default [$total]: $_"
}
$startTime = [DateTime]::UtcNow
# Initial estimate: 1.7 seconds per remaining page.
$remainingSeconds = ($total - $page) * 1.7
# Crawl pages one after another until a page reports no successor or a request fails.
# Names are collected in a list (amortised O(1) append) and merged into $result once
# at the end, instead of re-allocating the array for every single name.
$names = [System.Collections.Generic.List[string]]::new()
do
{
    try
    {
        # Write-Progress rejects percentages outside -1..100; an unclamped value
        # would throw here and the catch below would end the crawl prematurely
        # as soon as $page exceeds $total.
        $percent = -1
        if ($total -gt 0)
        {
            $percent = [int][Math]::Min(100.0, [Math]::Max(0.0, $page * 100.0 / $total))
        }
        $secondsLeft = [int][Math]::Max(0.0, $remainingSeconds)
        Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete $percent -SecondsRemaining $secondsLeft

        $response = Request-Page $page
        $hasNextPage = $response.pageInfo.hasNextPage
        foreach ($char in $response.characters)
        {
            $name = $char.name.full
            if ($null -eq $name) { continue }
            # Collapse every run of whitespace into a single space, then trim.
            $name = ($name -replace '\s+', ' ').Trim()
            # Skip names that are too short or consist of digits only.
            if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
            {
                Write-Host "Skipping $name"
                continue
            }
            $names.Add($name)
        }
        $page++

        # Grow the total if the server reports more pages than expected.
        $total = [Math]::Max($total, $response.pageInfo.lastPage)
        if (($page - $startPage) -gt 60)
        {
            # Enough samples: extrapolate from the measured average time per page.
            $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
        }
        else
        {
            # Too few samples yet: assume 1.7 seconds per page.
            $remainingSeconds = ($total - $page) * 1.7
        }
    }
    catch
    {
        Write-Error "Failed to request page $page`: $_"
        # Stop crawling; everything collected so far is still kept.
        $hasNextPage = $false
    }
} while ($hasNextPage)
$result += $names.ToArray()
Write-Host "Stopped on page $page"
Write-Progress -Activity "Downloading" -Completed
Write-Host 'Saving the results...'
# The first line of the output file is a comment naming the data source.
'# https://anilist.co/search/characters' | Out-File -LiteralPath $output
# Get-Unique is case-sensitive and only compares adjacent items, so the sort has to be
# case-sensitive too; otherwise identical names separated by a case variant survive.
$result | Sort-Object -CaseSensitive | Get-Unique | Out-File -LiteralPath $output -Append
Write-Host 'Done.'