update some scripts

This commit is contained in:
13xforever 2022-12-03 11:55:59 +05:00
parent deb8b5bbd2
commit 07b2bd68ae
No known key found for this signature in database
GPG Key ID: 2B2A36B482FE70C5
2 changed files with 217 additions and 91 deletions

View File

@ -2,9 +2,9 @@
<#
.SYNOPSIS
Script to crawl AniDB for character names
.PARAMETER startletter
Letter to start from (a-z). Useful for resuming.
Default is a.
.PARAMETER includeMecha
If enabled, include mecha names in the list.
Default is not enabled.
.PARAMETER startpage
Letter page to start from. Assumes the default number of rows per page.
Default is 0.
@ -13,100 +13,159 @@ Output filename.
Default is names_anidb.txt.
#>
param(
[char]$startletter = 'a',
[switch]$includeMecha = $false,
[int]$startpage = 0,
[string]$output = 'names_anidb.txt'
)
# disable extra noise for invoke-webrequest
$ProgressPreference = "SilentlyContinue"
#$ProgressPreference = "SilentlyContinue"
$page = $startpage
$total = 3971
$result = @()
$hasNextPage = $false
# get anonymous session
Invoke-WebRequest "https://anidb.net" -SessionVariable 'Session' | Out-Null
Start-Sleep -Seconds 2
$result = @()
foreach ($letter in 'a'..'z')
function Request-Page($num)
{
if ($letter -lt $startletter)
{
continue
}
# all template: https://anidb.net/character/?noalias=1&orderby.name=0.1&page=1&view=list
# letter template: https://anidb.net/character/?char=a&noalias=1&orderby.name=0.1&page=1&view=list
$url = "https://anidb.net/character/?noalias=1&orderby.name=0.1&page=$num&view=list"
$response = Invoke-WebRequest $url -WebSession $Session
#$links = @($response.links | Where-Object { $_.href -match '/character/\d+$' } | Where-Object { $_.outerHTML -match '^<a [^>]+>[^<][^\n]+</a>$' })
Start-Sleep -Seconds 3
return $response.Content
}
$page = 0
if ($letter -eq $startletter)
{
$page = $startpage
}
$hasNextPage = $false
function Get-TotalPageCount($start)
{
$min = $start
$max = $start + 1000
do
{
try
$html = Request-Page $max
Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
$max += 1000
} while ($html.contains('>next</a></li>'))
while ($min -lt $max)
{
$middle = $min + [int](($max - $min) / 2)
$html = Request-Page $middle
if ($html.Contains('>next</a></li>'))
{
Write-Host "Requesting letter $letter, page $page..."
$url = "https://anidb.net/character/?char=$letter&noalias=1&orderby.name=0.1&view=list&page=$page"
$response = Invoke-WebRequest $url -WebSession $Session
$html = $response.content
$hasNextPage = $html.contains('>next</a></li>')
$pos = $html.IndexOf('<table class="characterlist">')
if ($pos -lt 1)
{
if ($html.Contains('Please unban me'))
{
Write-Host 'Saving the results...'
'# https://anidb.net/character' | Out-File -LiteralPath $output
$result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
Read-Host -Prompt 'Rate limited, plz unban...'
continue
}
else
{
Write-Host $html
Write-Error "This script needs updating"
exit -1
}
}
do
{
$pos = $html.IndexOf('<td data-label="Title"', $pos)
if ($pos -lt 0)
{
break
}
$pos = $html.IndexOf('<a href=', $pos)
$pos = $html.IndexOf('>', $pos)
$endPos = $html.IndexOf('</a></td>', $pos)
$name = $html.Substring($pos + 1, $endPos - $pos - 1).Trim()
$pos = $html.IndexOf('<td data-label="Type"', $endPos)
$pos = $html.IndexOf('>', $pos)
$endPos = $html.IndexOf('</td>', $pos)
$type = $html.Substring($pos + 1, $endPos - $pos - 1)
if ($type -ieq 'Character') # consider adding Mecha
{
$result += $name
}
else
{
Write-Host "Skipped $name ($type)"
}
} until ($pos -lt 0)
$page++
Start-Sleep -Seconds 2 # increase if needed
$min = $middle
}
catch
elseif ($html.Contains('<div class="container">No results.'))
{
Write-Host "Failed to fetch the page" -ForegroundColor Yellow
$max = $middle
}
} while ($hasNextPage)
else
{
return $middle
}
Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
}
return $middle
}
# Main flow: estimate the number of pages, crawl them one by one starting at
# $page, then write the sorted, de-duplicated names to $output.

# Counting is best effort: when it fails, the previously assigned $total is
# kept and only the progress/ETA display is affected.
try
{
    Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
    $total = Get-TotalPageCount $startPage
}
catch
{
    Write-Warning "Failed to count total number of pages, using default [$total]"
}

# The ETA clock starts after counting, so it only measures the download phase.
$startTime = [DateTime]::UtcNow
# Initial guess of 1.7 seconds per page until enough pages were timed.
$remainingSeconds = ($total - $page) * 1.7
do
{
    try
    {
        # Write-Progress rejects percentages outside 0..100; such an error would
        # be swallowed by the catch below and silently end the crawl, so clamp.
        $percent = 0
        if ($total -gt 0)
        {
            $percent = [Math]::Min(100, [Math]::Max(0, $page * 100.0 / $total))
        }
        Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete $percent -SecondsRemaining ([Math]::Max(0, $remainingSeconds))

        $html = Request-Page $page
        $hasNextPage = $html.Contains('>next</a></li>')
        $pos = $html.IndexOf('<table class="characterlist">')
        if ($pos -lt 1)
        {
            if ($html.Contains('Please unban me'))
            {
                # Rate limited: dump what we have so far, then wait for the user.
                Write-Host 'Saving the results...'
                '# https://anidb.net/character' | Out-File -LiteralPath $output
                $result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
                Read-Host -Prompt 'Rate limited, plz unban...'
                # `continue` jumps to the while($hasNextPage) check, and the value
                # computed from the ban page would end the loop. Force a retry of
                # the same page instead ($page has not been incremented yet).
                $hasNextPage = $true
                continue
            }
            else
            {
                # Unknown page layout: show it and give up.
                Write-Host $html
                Write-Error "This script needs updating"
                exit -1
            }
        }

        # Walk the table rows: each row has a "Title" cell holding a link with
        # the name, followed by a "Type" cell.
        do
        {
            $pos = $html.IndexOf('<td data-label="Title"', $pos)
            if ($pos -lt 0)
            {
                break
            }

            $pos = $html.IndexOf('<a href=', $pos)
            $pos = $html.IndexOf('>', $pos)
            $endPos = $html.IndexOf('</a></td>', $pos)
            # NOTE(review): as written this Replace looks like a no-op; presumably
            # the first argument was a double or non-breaking space - confirm.
            $name = $html.Substring($pos + 1, $endPos - $pos - 1).Replace(' ', ' ').Trim()

            $pos = $html.IndexOf('<td data-label="Type"', $endPos)
            $pos = $html.IndexOf('>', $pos)
            $endPos = $html.IndexOf('</td>', $pos)
            $type = $html.Substring($pos + 1, $endPos - $pos - 1)

            if (($type -ieq 'Character') -or ($includeMecha -and ($type -ieq 'Mecha')))
            {
                # Drop single-character and purely numeric names.
                if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
                {
                    Write-Host "Skipping $name"
                    continue
                }

                $result += $name
            }
            else
            {
                Write-Host "Skipping $name ($type)"
            }
        } until ($pos -lt 0)

        $page++
        # The HTML response carries no page count, so grow $total whenever the
        # site says there is a page beyond the current estimate.
        if ($hasNextPage)
        {
            $total = [Math]::Max($total, $page)
        }

        if (($page - $startPage) -gt 60)
        {
            # Enough samples: extrapolate from the measured average time per page.
            $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
        }
        else
        {
            $remainingSeconds = ($total - $page) * 1.7
        }
    }
    catch
    {
        # Stop crawling but still fall through to saving; report the cause.
        Write-Host "Failed to fetch the page $page`: $_" -ForegroundColor Yellow
        $hasNextPage = $false
    }
} while ($hasNextPage)

Write-Host "Stopped on page $page"
Write-Progress -Activity "Downloading" -Completed
Write-Host 'Saving the results...'
'# https://anidb.net/character' | Out-File -LiteralPath $output
$result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append

View File

@ -2,12 +2,16 @@
<#
.SYNOPSIS
Script to crawl AniList for character names
.PARAMETER startPage
Start page number.
Default is 0.
.PARAMETER output
Output filename.
Default is names_anilist.txt.
#>
param(
[int]$startPage = 0,
[string]$output = 'names_anilist.txt'
)
@ -15,41 +19,104 @@ param(
#$ProgressPreference = "SilentlyContinue"
$requestTemplate = '{ "query": "query { Page (page: {0}, perPage: 50) { characters { name { full } } pageInfo { hasNextPage lastPage } } }" }'
$startTime = [DateTime]::UtcNow
$page = 0
$page = $startPage
$total = 2565
$result = @()
$hasNextPage = $false
<#
.SYNOPSIS
Requests one page of characters from the AniList GraphQL endpoint.
.PARAMETER num
Page number substituted for the {0} placeholder in $requestTemplate.
.OUTPUTS
The data.Page object of the response (it is read by callers for its
characters and pageInfo properties).
#>
function Request-Page($num)
{
    # String.Replace takes strings; convert the page number explicitly.
    $json = $requestTemplate.Replace('{0}', "$num")
    $response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json" -TimeoutSec 30
    # Throttle every request. -Milliseconds is used because -Seconds is an
    # Int32 on Windows PowerShell 5.1, where 1.5 would be rounded to 2.
    Start-Sleep -Milliseconds 1500
    return $response.data.Page
}
<#
.SYNOPSIS
Finds the number of the last non-empty page by probing with Request-Page.
.DESCRIPTION
First jumps forward in steps of 1000 pages until a probed page reports no next
page, then binary-searches between the last page known to have a successor and
the first page known to be empty.
.PARAMETER start
Page number to start searching from.
.OUTPUTS
The number of the last page that has characters, or $start when no page at or
after $start has any.
#>
function Get-TotalPageCount($start)
{
    $step = 1000
    # Invariant: the last page, if any, is never below $min.
    $min = $start
    $max = $start + $step

    # Phase 1: gallop forward until the probed page has no successor.
    while ($true)
    {
        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
        $response = Request-Page $max
        if (-not $response.pageInfo.hasNextPage)
        {
            break
        }

        # $max has a successor, so the last page is at least $max + 1.
        $min = $max + 1
        $max += $step
    }

    if ($response.characters.Count -gt 0)
    {
        # The probe landed exactly on the last page.
        return $max
    }

    # Phase 2: $max is empty, so the last page lies in [$min, $max).
    while ($min -lt $max)
    {
        # Floor keeps $middle strictly below $max; together with
        # "$min = $middle + 1" the range shrinks on every iteration.
        $middle = $min + [int][Math]::Floor(($max - $min) / 2.0)
        $response = Request-Page $middle
        if ($response.pageInfo.hasNextPage)
        {
            $min = $middle + 1
        }
        elseif ($response.characters.Count -eq 0)
        {
            $max = $middle
        }
        else
        {
            # Has characters but no successor: this is the last page.
            return $middle
        }

        Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
    }

    # Only reached when every probed page from $start on was empty, or when the
    # answers were inconsistent; fall back to the page just below the first
    # known-empty one, never below $start.
    return [Math]::Max($start, $max - 1)
}
# Count the pages up front so the progress bar has a meaningful total.
# This is best effort: on failure the previously assigned $total is kept.
try
{
Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
$total = Get-TotalPageCount $startPage
}
catch
{
Write-Warning "Failed to count total number of pages, using default [$total]"
}
# Restart the clock after counting so the ETA only measures the download phase.
$startTime = [DateTime]::UtcNow
# Initial estimate of 1.7 seconds per remaining page.
$remainingSeconds = ($total - $page) * 1.7
do
{
try
{
$json = $requestTemplate.Replace('{0}', $page)
$response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json"
$response = $response.data.Page
Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete ($page * 100.0 / $total) -SecondsRemaining $remainingSeconds
$response = Request-Page $page
$hasNextPage = $response.pageInfo.hasNextPage
$chars = $response.characters
foreach ($char in $chars)
{
$result += $char.name.full.Trim()
$name = $char.name.full
if ($null -eq $name) { continue }
$name = $char.name.full.Replace(' ', ' ').Trim()
if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
{
Write-Host "Skipping $name"
continue
}
$result += $name
}
$page++
$total = $response.pageInfo.lastPage
$remainingSeconds = ($total - $page) * 1.7
if ($page -gt 100)
$total = [Math]::Max($total, $response.pageInfo.lastPage)
if (($page - $startPage) -gt 60)
{
$remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / $page * ($total - $page)
$remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
}
else
{
$remainingSeconds = ($total - $page) * 1.7
}
Write-Progress Downloading -CurrentOperation "Page $page out of $total" -PercentComplete ($page * 100 / $total) -SecondsRemaining $remainingSeconds
Start-Sleep -Seconds 1
}
catch
{
Write-Host "Failed to request page $page"
Write-Error "Failed to request page $page`: $_"
$hasNextPage = $false
}
} while ($hasNextPage)
Write-Progress Downloading -Completed
Write-Host "Stopped on page $page"
Write-Progress -Activity "Downloading" -Completed
Write-Host 'Saving the results...'
'# https://anilist.co/search/characters' | Out-File -LiteralPath $output