2021-03-09 18:04:57 +00:00
|
|
|
#!/usr/bin/pwsh
<#
.SYNOPSIS
    Crawls AniList and collects character names into a text file.

.DESCRIPTION
    Requests the AniList character list page by page and writes the collected
    names, sorted and de-duplicated, to the output file.

.PARAMETER startPage
    Number of the first page to request.
    Defaults to 0.

.PARAMETER output
    Name of the file the names are written to.
    Defaults to names_anilist.txt.
#>

param(
    [int]$startPage = 0,
    [string]$output = 'names_anilist.txt'
)
|
|
|
|
|
|
|
|
# Optional: disable extra noise for Invoke-WebRequest. Left commented out because
# this preference also hides the progress bars this script draws with Write-Progress.
#$ProgressPreference = "SilentlyContinue"

# GraphQL request body; the literal {0} is replaced with the page number for each request.
$requestTemplate = '{ "query": "query { Page (page: {0}, perPage: 50) { characters { name { full } } pageInfo { hasNextPage lastPage } } }" }'

# Paging state.
$hasNextPage = $false   # whether the most recent response announced another page
$page = $startPage      # page that will be requested next
$total = 2565           # default page count, used when counting the pages fails

# Names collected so far.
$result = @()
|
2022-12-03 06:55:59 +00:00
|
|
|
|
|
|
|
function Request-Page($num)
{
    <#
    .SYNOPSIS
    Requests one page of characters from the AniList GraphQL endpoint.
    .PARAMETER num
    Page number substituted for the {0} placeholder in $requestTemplate.
    .OUTPUTS
    The data.Page member of the response.
    #>
    $request = @{
        Uri         = 'https://graphql.anilist.co'
        Method      = 'Post'
        Body        = $requestTemplate.Replace('{0}', $num)
        ContentType = "application/json"
        TimeoutSec  = 30
    }
    $reply = Invoke-RestMethod @request

    # Pause after every request so consecutive calls are spaced out.
    Start-Sleep -Seconds 1.5

    return $reply.data.Page
}
|
|
|
|
|
|
|
|
function Get-TotalPageCount($start)
{
    <#
    .SYNOPSIS
    Finds the number of the last page that still contains characters.
    .DESCRIPTION
    Probes pages through Request-Page in two phases. First the upper probe is
    pushed forward (start + 3000, then in steps of 1000) until a page reports
    no next page. Then the range between the last page known to have a
    successor and that probe is bisected. A page is the last one when it has
    characters but reports no next page.
    .PARAMETER start
    Page number the search starts from; it is the initial lower bound.
    .OUTPUTS
    The last page number. When every probed page is empty (start lies beyond
    the last page) the lower bound itself is returned.
    #>
    $min = $start           # lowest page that can still be the last one
    $max = $start + 3000    # page probed next

    # Phase 1: move the probe forward until it no longer reports a next page.
    while ($true)
    {
        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
        $response = Request-Page $max
        if (-not $response.pageInfo.hasNextPage) { break }

        # Page $max has a successor, so the last page is at least $max + 1.
        $min = $max + 1
        $max += 1000
    }

    # The probe may have landed exactly on the last page.
    if ($response.characters.Count -gt 0)
    {
        return $max
    }

    # Phase 2: bisect. $min is an inclusive lower bound, $max an empty page (exclusive upper bound).
    while ($min -lt $max)
    {
        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"

        # Floor instead of an [int] cast: the cast rounds half to even, which is harder to reason about.
        $middle = $min + [int][Math]::Floor(($max - $min) / 2.0)
        $response = Request-Page $middle
        if ($response.pageInfo.hasNextPage)
        {
            # Advancing past $middle shrinks the range on every iteration, so the loop always terminates.
            $min = $middle + 1
        }
        elseif ($response.characters.Count -eq 0)
        {
            $max = $middle
        }
        else
        {
            return $middle
        }
    }

    return $min
}
|
|
|
|
|
|
|
|
# Try to determine the real number of pages; if that fails for any reason,
# $total keeps the value it had before and a warning is shown.
try
{
    $countingProgress = @{
        Activity        = "Downloading"
        Status          = "Counting pages..."
        PercentComplete = 0
    }
    Write-Progress @countingProgress

    $total = Get-TotalPageCount $startPage
}
catch
{
    Write-Warning "Failed to count total number of pages, using default [$total]"
}
|
|
|
|
|
|
|
|
# Download loop: requests pages one after another, starting at $page, until the
# API reports no further page or a request fails. Accepted names end up in $result.

# A growable list; appending to an array with += copies the whole array every time.
$result = [System.Collections.Generic.List[string]]::new()

$startTime = [DateTime]::UtcNow
# Rough estimate of 1.7 seconds per page until enough pages were fetched to measure the real rate.
$remainingSeconds = ($total - $page) * 1.7
do
{
    try
    {
        # Write-Progress rejects -PercentComplete values outside its valid range, and the
        # resulting error would land in the catch below and end the crawl. Clamp the value
        # so an underestimated (or zero) $total cannot abort the download.
        $percent = 0
        if ($total -gt 0)
        {
            $percent = [Math]::Min(100, [Math]::Max(0, [int]($page * 100.0 / $total)))
        }
        $secondsLeft = [Math]::Max(0, [int]$remainingSeconds)
        Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete $percent -SecondsRemaining $secondsLeft

        $response = Request-Page $page
        $hasNextPage = $response.pageInfo.hasNextPage
        foreach ($char in $response.characters)
        {
            $name = $char.name.full
            if ($null -eq $name) { continue }

            # Collapse every run of whitespace (regular or non-breaking spaces, tabs,
            # line breaks) into one space so each name fits on a single clean line.
            $name = ($name -replace '\s+', ' ').Trim()

            # Skip names that are too short or consist of digits only.
            if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
            {
                Write-Host "Skipping $name"
                continue
            }
            $result.Add($name)
        }

        $page++
        # The API may report more pages than were counted up front; never shrink the total.
        $total = [Math]::Max($total, $response.pageInfo.lastPage)

        $fetched = $page - $startPage
        if ($fetched -gt 60)
        {
            # Enough samples: extrapolate from the measured average time per page.
            $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / $fetched * ($total - $page)
        }
        else
        {
            $remainingSeconds = ($total - $page) * 1.7
        }
    }
    catch
    {
        # Best effort: report the failure and stop paging; names collected so far are kept.
        Write-Error "Failed to request page $page`: $_"
        $hasNextPage = $false
    }
} while ($hasNextPage)

Write-Host "Stopped on page $page"
Write-Progress -Activity "Downloading" -Completed
|
2021-03-09 18:04:57 +00:00
|
|
|
|
|
|
|
# Write the output file: a header line with the source URL, then the unique names in sorted order.
Write-Host 'Saving the results...'
'# https://anilist.co/search/characters' | Out-File -LiteralPath $output

# Get-Unique only removes adjacent duplicates and compares strings case-sensitively.
# Sort-Object ignores case by default, which can leave identical names separated by a
# case variant; sorting case-sensitively keeps identical names next to each other.
$result | Sort-Object -CaseSensitive | Get-Unique | Out-File -LiteralPath $output -Append

Write-Host 'Done.'
|