mirror of
https://github.com/RPCS3/discord-bot.git
synced 2024-11-23 02:09:38 +00:00
update some scripts
This commit is contained in:
parent
deb8b5bbd2
commit
07b2bd68ae
@ -2,9 +2,9 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Script to crawl AniDB for character names
|
||||
.PARAMETER startletter
|
||||
Letter to start from (a-z). Useful for resuming.
|
||||
Default is a.
|
||||
.PARAMETER includeMecha
|
||||
If enabled, include mecha names in the list.
|
||||
Default is not enabled.
|
||||
.PARAMETER startpage
|
||||
Letter page to start from. Assume default number of rows per page.
|
||||
Default is 0.
|
||||
@ -13,100 +13,159 @@ Output filename.
|
||||
Default is names_anidb.txt.
|
||||
#>
|
||||
param(
|
||||
[char]$startletter = 'a',
|
||||
[switch]$includeMecha = $false,
|
||||
[int]$startpage = 0,
|
||||
[string]$output = 'names_anidb.txt'
|
||||
)
|
||||
|
||||
# disable extra noise for invoke-webrequest
|
||||
$ProgressPreference = "SilentlyContinue"
|
||||
#$ProgressPreference = "SilentlyContinue"
|
||||
|
||||
$page = $startpage
|
||||
$total = 3971
|
||||
$result = @()
|
||||
$hasNextPage = $false
|
||||
|
||||
# get anonymous session
|
||||
Invoke-WebRequest "https://anidb.net" -SessionVariable 'Session' | Out-Null
|
||||
Start-Sleep -Seconds 2
|
||||
|
||||
$result = @()
|
||||
foreach ($letter in 'a'..'z')
|
||||
function Request-Page($num)
|
||||
{
|
||||
if ($letter -lt $startletter)
|
||||
{
|
||||
continue
|
||||
}
|
||||
# all template: https://anidb.net/character/?noalias=1&orderby.name=0.1&page=1&view=list
|
||||
# letter template: https://anidb.net/character/?char=a&noalias=1&orderby.name=0.1&page=1&view=list
|
||||
$url = "https://anidb.net/character/?noalias=1&orderby.name=0.1&page=$num&view=list"
|
||||
$response = Invoke-WebRequest $url -WebSession $Session
|
||||
#$links = @($response.links | Where-Object { $_.href -match '/character/\d+$' } | Where-Object { $_.outerHTML -match '^<a [^>]+>[^<][^\n]+</a>$' })
|
||||
Start-Sleep -Seconds 3
|
||||
return $response.Content
|
||||
}
|
||||
|
||||
$page = 0
|
||||
if ($letter -eq $startletter)
|
||||
{
|
||||
$page = $startpage
|
||||
}
|
||||
|
||||
$hasNextPage = $false
|
||||
function Get-TotalPageCount($start)
|
||||
{
|
||||
$min = $start
|
||||
$max = $start + 1000
|
||||
do
|
||||
{
|
||||
try
|
||||
$html = Request-Page $max
|
||||
Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
|
||||
$max += 1000
|
||||
} while ($html.contains('>next</a></li>'))
|
||||
|
||||
while ($min -lt $max)
|
||||
{
|
||||
$middle = $min + [int](($max - $min) / 2)
|
||||
$html = Request-Page $middle
|
||||
if ($html.Contains('>next</a></li>'))
|
||||
{
|
||||
Write-Host "Requesting letter $letter, page $page..."
|
||||
$url = "https://anidb.net/character/?char=$letter&noalias=1&orderby.name=0.1&view=list&page=$page"
|
||||
$response = Invoke-WebRequest $url -WebSession $Session
|
||||
$html = $response.content
|
||||
$hasNextPage = $html.contains('>next</a></li>')
|
||||
$pos = $html.IndexOf('<table class="characterlist">')
|
||||
if ($pos -lt 1)
|
||||
{
|
||||
if ($html.Contains('Please unban me'))
|
||||
{
|
||||
Write-Host 'Saving the results...'
|
||||
'# https://anidb.net/character' | Out-File -LiteralPath $output
|
||||
$result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
|
||||
Read-Host -Prompt 'Rate limited, plz unban...'
|
||||
continue
|
||||
}
|
||||
else
|
||||
{
|
||||
Write-Host $html
|
||||
Write-Error "This script needs updating"
|
||||
exit -1
|
||||
}
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
$pos = $html.IndexOf('<td data-label="Title"', $pos)
|
||||
if ($pos -lt 0)
|
||||
{
|
||||
break
|
||||
}
|
||||
|
||||
$pos = $html.IndexOf('<a href=', $pos)
|
||||
$pos = $html.IndexOf('>', $pos)
|
||||
$endPos = $html.IndexOf('</a></td>', $pos)
|
||||
$name = $html.Substring($pos + 1, $endPos - $pos - 1).Trim()
|
||||
|
||||
$pos = $html.IndexOf('<td data-label="Type"', $endPos)
|
||||
$pos = $html.IndexOf('>', $pos)
|
||||
$endPos = $html.IndexOf('</td>', $pos)
|
||||
$type = $html.Substring($pos + 1, $endPos - $pos - 1)
|
||||
|
||||
if ($type -ieq 'Character') # consider adding Mecha
|
||||
{
|
||||
$result += $name
|
||||
}
|
||||
else
|
||||
{
|
||||
Write-Host "Skipped $name ($type)"
|
||||
}
|
||||
} until ($pos -lt 0)
|
||||
|
||||
$page++
|
||||
Start-Sleep -Seconds 2 # increase if needed
|
||||
$min = $middle
|
||||
}
|
||||
catch
|
||||
elseif ($html.Contains('<div class="container">No results.'))
|
||||
{
|
||||
Write-Host "Failed to fetch the page" -ForegroundColor Yellow
|
||||
$max = $middle
|
||||
}
|
||||
|
||||
} while ($hasNextPage)
|
||||
else
|
||||
{
|
||||
return $middle
|
||||
}
|
||||
Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
|
||||
}
|
||||
return $middle
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
|
||||
$total = Get-TotalPageCount $startPage
|
||||
}
|
||||
catch
|
||||
{
|
||||
Write-Warning "Failed to count total number of pages, using default [$total]"
|
||||
}
|
||||
|
||||
$startTime = [DateTime]::UtcNow
|
||||
$remainingSeconds = ($total - $page) * 1.7
|
||||
|
||||
do
|
||||
{
|
||||
try
|
||||
{
|
||||
Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete ($page * 100.0 / $total) -SecondsRemaining $remainingSeconds
|
||||
$html = Request-Page $page
|
||||
$hasNextPage = $html.Contains('>next</a></li>')
|
||||
$pos = $html.IndexOf('<table class="characterlist">')
|
||||
if ($pos -lt 1)
|
||||
{
|
||||
if ($html.Contains('Please unban me'))
|
||||
{
|
||||
Write-Host 'Saving the results...'
|
||||
'# https://anidb.net/character' | Out-File -LiteralPath $output
|
||||
$result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
|
||||
Read-Host -Prompt 'Rate limited, plz unban...'
|
||||
continue
|
||||
}
|
||||
else
|
||||
{
|
||||
Write-Host $html
|
||||
Write-Error "This script needs updating"
|
||||
exit -1
|
||||
}
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
$pos = $html.IndexOf('<td data-label="Title"', $pos)
|
||||
if ($pos -lt 0)
|
||||
{
|
||||
break
|
||||
}
|
||||
|
||||
$pos = $html.IndexOf('<a href=', $pos)
|
||||
$pos = $html.IndexOf('>', $pos)
|
||||
$endPos = $html.IndexOf('</a></td>', $pos)
|
||||
$name = $html.Substring($pos + 1, $endPos - $pos - 1).Replace(' ', ' ').Trim()
|
||||
|
||||
$pos = $html.IndexOf('<td data-label="Type"', $endPos)
|
||||
$pos = $html.IndexOf('>', $pos)
|
||||
$endPos = $html.IndexOf('</td>', $pos)
|
||||
$type = $html.Substring($pos + 1, $endPos - $pos - 1)
|
||||
|
||||
if (($type -ieq 'Character') -or ($includeMecha -and ($type -ieq 'Mecha')))
|
||||
{
|
||||
if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
|
||||
{
|
||||
Write-Host "Skipping $name"
|
||||
continue
|
||||
}
|
||||
|
||||
$result += $name
|
||||
}
|
||||
else
|
||||
{
|
||||
Write-Host "Skipping $name ($type)"
|
||||
}
|
||||
} until ($pos -lt 0)
|
||||
|
||||
$page++
|
||||
$total = [Math]::Max($total, $response.pageInfo.lastPage)
|
||||
if (($page - $startPage) -gt 60)
|
||||
{
|
||||
$remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
|
||||
}
|
||||
else
|
||||
{
|
||||
$remainingSeconds = ($total - $page) * 1.7
|
||||
}
|
||||
}
|
||||
catch
|
||||
{
|
||||
Write-Host "Failed to fetch the page" -ForegroundColor Yellow
|
||||
$hasNextPage = $false
|
||||
}
|
||||
} while ($hasNextPage)
|
||||
Write-Host "Stopped on page $page"
|
||||
Write-Progress -Activity "Downloading" -Completed
|
||||
|
||||
Write-Host 'Saving the results...'
|
||||
'# https://anidb.net/character' | Out-File -LiteralPath $output
|
||||
$result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append
|
||||
|
@ -2,12 +2,16 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Script to crawl AniList for character names
|
||||
.PARAMETER startPage
|
||||
Start page number.
|
||||
Default is 0.
|
||||
.PARAMETER output
|
||||
Output filename.
|
||||
Default is names_anilist.txt.
|
||||
#>
|
||||
|
||||
param(
|
||||
[int]$startPage = 0,
|
||||
[string]$output = 'names_anilist.txt'
|
||||
)
|
||||
|
||||
@ -15,41 +19,104 @@ param(
|
||||
#$ProgressPreference = "SilentlyContinue"
|
||||
|
||||
$requestTemplate = '{ "query": "query { Page (page: {0}, perPage: 50) { characters { name { full } } pageInfo { hasNextPage lastPage } } }" }'
|
||||
$startTime = [DateTime]::UtcNow
|
||||
$page = 0
|
||||
$page = $startPage
|
||||
$total = 2565
|
||||
$result = @()
|
||||
$hasNextPage = $false
|
||||
|
||||
function Request-Page($num)
|
||||
{
|
||||
$json = $requestTemplate.Replace('{0}', $num)
|
||||
$response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json" -TimeoutSec 30
|
||||
Start-Sleep -Seconds 1.5
|
||||
return $response.data.Page
|
||||
}
|
||||
|
||||
function Get-TotalPageCount($start)
|
||||
{
|
||||
$min = $start
|
||||
$max = $start + 1000
|
||||
do
|
||||
{
|
||||
$response = Request-Page $max
|
||||
Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
|
||||
$max += 1000
|
||||
} while ($response.pageInfo.hasNextPage)
|
||||
|
||||
while ($min -lt $max)
|
||||
{
|
||||
$middle = $min + [int](($max - $min) / 2)
|
||||
$response = Request-Page $middle
|
||||
if ($response.pageInfo.hasNextPage)
|
||||
{
|
||||
$min = $middle
|
||||
}
|
||||
elseif ($response.characters.Count -eq 0)
|
||||
{
|
||||
$max = $middle
|
||||
}
|
||||
else
|
||||
{
|
||||
return $middle
|
||||
}
|
||||
Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]"
|
||||
}
|
||||
return $middle
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0
|
||||
$total = Get-TotalPageCount $startPage
|
||||
}
|
||||
catch
|
||||
{
|
||||
Write-Warning "Failed to count total number of pages, using default [$total]"
|
||||
}
|
||||
|
||||
$startTime = [DateTime]::UtcNow
|
||||
$remainingSeconds = ($total - $page) * 1.7
|
||||
do
|
||||
{
|
||||
try
|
||||
{
|
||||
$json = $requestTemplate.Replace('{0}', $page)
|
||||
$response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json"
|
||||
$response = $response.data.Page
|
||||
Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete ($page * 100.0 / $total) -SecondsRemaining $remainingSeconds
|
||||
$response = Request-Page $page
|
||||
$hasNextPage = $response.pageInfo.hasNextPage
|
||||
$chars = $response.characters
|
||||
foreach ($char in $chars)
|
||||
{
|
||||
$result += $char.name.full.Trim()
|
||||
$name = $char.name.full
|
||||
if ($null -eq $name) { continue }
|
||||
|
||||
$name = $char.name.full.Replace(' ', ' ').Trim()
|
||||
if (($name.Length -lt 2) -or ("$name" -match '^\d+$'))
|
||||
{
|
||||
Write-Host "Skipping $name"
|
||||
continue
|
||||
}
|
||||
$result += $name
|
||||
}
|
||||
|
||||
$page++
|
||||
$total = $response.pageInfo.lastPage
|
||||
$remainingSeconds = ($total - $page) * 1.7
|
||||
if ($page -gt 100)
|
||||
$total = [Math]::Max($total, $response.pageInfo.lastPage)
|
||||
if (($page - $startPage) -gt 60)
|
||||
{
|
||||
$remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / $page * ($total - $page)
|
||||
$remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page)
|
||||
}
|
||||
else
|
||||
{
|
||||
$remainingSeconds = ($total - $page) * 1.7
|
||||
}
|
||||
Write-Progress Downloading -CurrentOperation "Page $page out of $total" -PercentComplete ($page * 100 / $total) -SecondsRemaining $remainingSeconds
|
||||
Start-Sleep -Seconds 1
|
||||
}
|
||||
catch
|
||||
{
|
||||
Write-Host "Failed to request page $page"
|
||||
Write-Error "Failed to request page $page`: $_"
|
||||
$hasNextPage = $false
|
||||
}
|
||||
} while ($hasNextPage)
|
||||
Write-Progress Downloading -Completed
|
||||
Write-Host "Stopped on page $page"
|
||||
Write-Progress -Activity "Downloading" -Completed
|
||||
|
||||
Write-Host 'Saving the results...'
|
||||
'# https://anilist.co/search/characters' | Out-File -LiteralPath $output
|
||||
|
Loading…
Reference in New Issue
Block a user