diff --git a/get-names_anidb.ps1 b/get-names_anidb.ps1
index 78cf9459..f073cf74 100644
--- a/get-names_anidb.ps1
+++ b/get-names_anidb.ps1
@@ -2,9 +2,9 @@
<#
.SYNOPSIS
Script to crawl AniDB for character names
-.PARAMETER startletter
-Letter to start from (a-z). Useful for resuming.
-Default is a.
+.PARAMETER includeMecha
+If enabled, include mecha names in the list.
+Default is not enabled.
.PARAMETER startpage
Letter page to start from. Asume default number of rows per page.
Default is 0.
@@ -13,100 +13,159 @@ Output filename.
Default is names_anidb.txt.
#>
param(
- [char]$startletter = 'a',
+ [switch]$includeMecha = $false,
[int]$startpage = 0,
[string]$output = 'names_anidb.txt'
)
# disable extra noise for invoke-webrequest
-$ProgressPreference = "SilentlyContinue"
+#$ProgressPreference = "SilentlyContinue"
+
+$page = $startpage
+$total = 3971
+$result = @()
+$hasNextPage = $false
# get anonymous sesssion
Invoke-WebRequest "https://anidb.net" -SessionVariable 'Session' | Out-Null
Start-Sleep -Seconds 2
-$result = @()
-foreach ($letter in 'a'..'z')
+function Request-Page($num)
{
- if ($letter -lt $startletter)
- {
- continue
- }
+ # all template: https://anidb.net/character/?noalias=1&orderby.name=0.1&page=1&view=list
+ # letter template: https://anidb.net/character/?char=a&noalias=1&orderby.name=0.1&page=1&view=list
+ $url = "https://anidb.net/character/?noalias=1&orderby.name=0.1&page=$num&view=list"
+ $response = Invoke-WebRequest $url -WebSession $Session
+ #$links = @($response.links | Where-Object { $_.href -match '/character/\d+$' } | Where-Object { $_.outerHTML -match '^]+>[^<][^\n]+$' })
+ Start-Sleep -Seconds 3
+ return $response.Content
+}
- $page = 0
- if ($letter -eq $startletter)
- {
- $page = $startpage
- }
-
- $hasNextPage = $false
+function Get-TotalPageCount($start)
+{
+ $min = $start
+ $max = $start + 1000
do
{
- try
+ $html = Request-Page $max
+ Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
+ $max += 1000
+ } while ($html.contains('>next'))
+
+ while ($min -lt $max)
+ {
+ $middle = $min + [int](($max - $min) / 2)
+ $html = Request-Page $middle
+ if ($html.Contains('>next'))
{
- Write-Host "Requesting letter $letter, page $page..."
- $url = "https://anidb.net/character/?char=$letter&noalias=1&orderby.name=0.1&view=list&page=$page"
- $response = Invoke-WebRequest $url -WebSession $Session
- $html = $response.content
- $hasNextPage = $html.contains('>next')
- $pos = $html.IndexOf('
')
- if ($pos -lt 1)
- {
- if ($html.Contains('Please unban me'))
- {
- Write-Host 'Saving the results...'
- '# https://anidb.net/character' | Out-File -LiteralPath $output
- $result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
- Read-Host -Prompt 'Rate limited, plz unban...'
- continue
- }
- else
- {
- Write-Host $html
- Write-Error "This script needs updating"
- exit -1
- }
- }
-
- do
- {
- $pos = $html.IndexOf('', $pos)
- $endPos = $html.IndexOf(' | ', $pos)
- $name = $html.Substring($pos + 1, $endPos - $pos - 1).Trim()
-
- $pos = $html.IndexOf('', $pos)
- $endPos = $html.IndexOf(' | ', $pos)
- $type = $html.Substring($pos + 1, $endPos - $pos - 1)
-
- if ($type -ieq 'Character') # consider adding Mecha
- {
- $result += $name
- }
- else
- {
- Write-Host "Skipped $name ($type)"
- }
- } until ($pos -lt 0)
-
- $page++
- Start-Sleep -Seconds 2 # increase if needed
+ # Page $middle has a successor, so the last page lies strictly above it.
+ # Moving past $middle also guarantees the range shrinks on every pass;
+ # with '$min = $middle' a 1-wide range re-probes the same page forever.
+ $min = $middle + 1
}
- catch
+ elseif ($html.Contains('No results.'))
{
- Write-Host "Failed to fetch the page" -ForegroundColor Yellow
+ $max = $middle
}
-
- } while ($hasNextPage)
+ else
+ {
+ return $middle
+ }
+ Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
+ }
+ return $middle
}
+# Probe the site for the real page count before crawling. If probing fails,
+# $total keeps the hard-coded estimate assigned earlier so the crawl still runs.
+try
+{
+    Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
+    $total = Get-TotalPageCount $startPage
+}
+catch
+{
+    # Report the underlying error instead of discarding it.
+    Write-Warning "Failed to count total number of pages, using default [$total]: $_"
+}
+
+$startTime = [DateTime]::UtcNow
+$remainingSeconds = ($total - $page) * 1.7
+
+do
+{
+ try
+ {
+ # -PercentComplete rejects values above 100; without the clamp an underestimated
+ # $total makes Write-Progress throw inside this try block and ends the crawl early.
+ Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete ([Math]::Min(100.0, $page * 100.0 / $total)) -SecondsRemaining $remainingSeconds
+ $html = Request-Page $page
+ $hasNextPage = $html.Contains('>next')
+ $pos = $html.IndexOf('
')
+ if ($pos -lt 1)
+ {
+ if ($html.Contains('Please unban me'))
+ {
+     # Persist what has been collected so far before waiting for the user.
+     Write-Host 'Saving the results...'
+     '# https://anidb.net/character' | Out-File -LiteralPath $output
+     $result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
+     Read-Host -Prompt 'Rate limited, plz unban...'
+     # 'continue' jumps to the loop condition, and $hasNextPage was computed from
+     # the ban page. Force it to true so the same $page is requested again
+     # instead of the loop silently ending here.
+     $hasNextPage = $true
+     continue
+ }
+ else
+ {
+ Write-Host $html
+ Write-Error "This script needs updating"
+ exit -1
+ }
+ }
+
+ do
+ {
+ $pos = $html.IndexOf('', $pos)
+ $endPos = $html.IndexOf(' | ', $pos)
+ $name = $html.Substring($pos + 1, $endPos - $pos - 1).Replace(' ', ' ').Trim()
+
+ $pos = $html.IndexOf('', $pos)
+ $endPos = $html.IndexOf(' | ', $pos)
+ $type = $html.Substring($pos + 1, $endPos - $pos - 1)
+
+ if (($type -ieq 'Character') -or ($includeMecha -and ($type -ieq 'Mecha')))
+ {
+ if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
+ {
+ Write-Host "Skipping $name"
+ continue
+ }
+
+ $result += $name
+ }
+ else
+ {
+ Write-Host "Skipping $name ($type)"
+ }
+ } until ($pos -lt 0)
+
+ $page++
+ # Request-Page returns raw HTML here, so there is no $response/pageInfo at script
+ # scope. Keep $total from falling behind the page counter instead.
+ $total = [Math]::Max($total, $page)
+ if (($page - $startPage) -gt 60)
+ {
+ $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
+ }
+ else
+ {
+ $remainingSeconds = ($total - $page) * 1.7
+ }
+ }
+ catch
+ {
+ Write-Host "Failed to fetch the page" -ForegroundColor Yellow
+ $hasNextPage = $false
+ }
+} while ($hasNextPage)
+Write-Host "Stopped on page $page"
+Write-Progress -Activity "Downloading" -Completed
+
Write-Host 'Saving the results...'
'# https://anidb.net/character' | Out-File -LiteralPath $output
$result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
diff --git a/get-names_anilist.ps1 b/get-names_anilist.ps1
index 0cbd7b46..ae00c8cc 100644
--- a/get-names_anilist.ps1
+++ b/get-names_anilist.ps1
@@ -2,12 +2,16 @@
<#
.SYNOPSIS
Script to crawl AniList for character names
+.PARAMETER startPage
+Start page number.
+Default is 0.
.PARAMETER output
Output filename.
Default is names_anilist.txt.
#>
param(
+ [int]$startPage = 0,
[string]$output = 'names_anilist.txt'
)
@@ -15,41 +19,104 @@ param(
#$ProgressPreference = "SilentlyContinue"
$requestTemplate = '{ "query": "query { Page (page: {0}, perPage: 50) { characters { name { full } } pageInfo { hasNextPage lastPage } } }" }'
-$startTime = [DateTime]::UtcNow
-$page = 0
+$page = $startPage
+$total = 2565
$result = @()
$hasNextPage = $false
+
+# Requests one page of characters from the AniList GraphQL endpoint.
+#   $num - page number substituted for '{0}' in $requestTemplate.
+# Returns the 'data.Page' object of the response (characters and pageInfo).
+# Errors from Invoke-RestMethod are not handled here; callers wrap the call in try/catch.
+# Pauses after every request (presumably to stay within the API rate limit).
+function Request-Page($num)
+{
+    $json = $requestTemplate.Replace('{0}', "$num")
+    $response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json" -TimeoutSec 30
+    # -Seconds is an Int32 in Windows PowerShell 5.1, so 1.5 would be rounded there;
+    # milliseconds give the same 1.5 s delay on every PowerShell version.
+    Start-Sleep -Milliseconds 1500
+    return $response.data.Page
+}
+
+# Finds the number of the last page that still contains characters.
+#   $start - page number to start searching from.
+# Returns the page number that has characters but reports no next page.
+# Every probe goes through Request-Page, so each one costs a throttled request.
+function Get-TotalPageCount($start)
+{
+    $min = $start
+    $max = $start + 1000
+
+    # Phase 1: jump ahead in steps of 1000 until a page reports no next page.
+    # Every page that does report one proves the last page lies above it,
+    # so the lower bound advances together with the probe.
+    while ($true)
+    {
+        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
+        $response = Request-Page $max
+        if (-not $response.pageInfo.hasNextPage) { break }
+        $min = $max + 1
+        $max += 1000
+    }
+
+    # The probed page has no successor: if it still holds characters it is the last page.
+    if ($response.characters.Count -gt 0)
+    {
+        return $max
+    }
+
+    # Phase 2: bisect the half-open range [$min, $max). Both bounds move strictly
+    # on every pass, so the loop ends even if the API answers inconsistently.
+    while ($min -lt $max)
+    {
+        $middle = $min + [int][Math]::Floor(($max - $min) / 2.0)
+        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
+        $response = Request-Page $middle
+        if ($response.pageInfo.hasNextPage)
+        {
+            $min = $middle + 1
+        }
+        elseif ($response.characters.Count -eq 0)
+        {
+            $max = $middle
+        }
+        else
+        {
+            return $middle
+        }
+    }
+
+    # Only reached when no page matched exactly (e.g. $start is past the end):
+    # fall back to the highest page known to have data, but never below $start.
+    return [Math]::Max($start, $min - 1)
+}
+
+# Probe the API for the real page count before crawling. If probing fails,
+# $total keeps the hard-coded estimate assigned earlier so the crawl still runs.
+try
+{
+    Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
+    $total = Get-TotalPageCount $startPage
+}
+catch
+{
+    # Report the underlying error, as the main loop's catch block does.
+    Write-Warning "Failed to count total number of pages, using default [$total]: $_"
+}
+
+$startTime = [DateTime]::UtcNow
+$remainingSeconds = ($total - $page) * 1.7
do
{
try
{
- $json = $requestTemplate.Replace('{0}', $page)
- $response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json"
- $response = $response.data.Page
+ # -PercentComplete rejects values above 100; without the clamp an underestimated
+ # $total makes Write-Progress throw inside this try block and ends the crawl early.
+ Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete ([Math]::Min(100.0, $page * 100.0 / $total)) -SecondsRemaining $remainingSeconds
+ $response = Request-Page $page
$hasNextPage = $response.pageInfo.hasNextPage
$chars = $response.characters
foreach ($char in $chars)
{
- $result += $char.name.full.Trim()
+ $name = $char.name.full
+ if ($null -eq $name) { continue }
+
+ $name = $char.name.full.Replace(' ', ' ').Trim()
+ if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
+ {
+ Write-Host "Skipping $name"
+ continue
+ }
+ $result += $name
}
$page++
- $total = $response.pageInfo.lastPage
- $remainingSeconds = ($total - $page) * 1.7
- if ($page -gt 100)
+ $total = [Math]::Max($total, $response.pageInfo.lastPage)
+ if (($page - $startPage) -gt 60)
{
- $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / $page * ($total - $page)
+ $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
+ }
+ else
+ {
+ $remainingSeconds = ($total - $page) * 1.7
}
- Write-Progress Downloading -CurrentOperation "Page $page out of $total" -PercentComplete ($page * 100 / $total) -SecondsRemaining $remainingSeconds
- Start-Sleep -Seconds 1
}
catch
{
- Write-Host "Failed to request page $page"
+ Write-Error "Failed to request page $page`: $_"
$hasNextPage = $false
}
} while ($hasNextPage)
-Write-Progress Downloading -Completed
+Write-Host "Stopped on page $page"
+Write-Progress -Activity "Downloading" -Completed
Write-Host 'Saving the results...'
'# https://anilist.co/search/characters' | Out-File -LiteralPath $output