From 07b2bd68aebf32d76cb64799f23d8114329a8270 Mon Sep 17 00:00:00 2001 From: 13xforever Date: Sat, 3 Dec 2022 11:55:59 +0500 Subject: [PATCH] update some scripts --- get-names_anidb.ps1 | 213 +++++++++++++++++++++++++++--------------- get-names_anilist.ps1 | 95 ++++++++++++++++--- 2 files changed, 217 insertions(+), 91 deletions(-) diff --git a/get-names_anidb.ps1 b/get-names_anidb.ps1 index 78cf9459..f073cf74 100644 --- a/get-names_anidb.ps1 +++ b/get-names_anidb.ps1 @@ -2,9 +2,9 @@ <# .SYNOPSIS Script to crawl AniDB for character names -.PARAMETER startletter -Letter to start from (a-z). Useful for resuming. -Default is a. +.PARAMETER includeMecha +If enabled, include mecha names in the list. +Default is not enabled. .PARAMETER startpage Letter page to start from. Asume default number of rows per page. Default is 0. @@ -13,100 +13,159 @@ Output filename. Default is names_anidb.txt. #> param( - [char]$startletter = 'a', + [switch]$includeMecha = $false, [int]$startpage = 0, [string]$output = 'names_anidb.txt' ) # disable extra noise for invoke-webrequest -$ProgressPreference = "SilentlyContinue" +#$ProgressPreference = "SilentlyContinue" + +$page = $startpage +$total = 3971 +$result = @() +$hasNextPage = $false # get anonymous sesssion Invoke-WebRequest "https://anidb.net" -SessionVariable 'Session' | Out-Null Start-Sleep -Seconds 2 -$result = @() -foreach ($letter in 'a'..'z') +function Request-Page($num) { - if ($letter -lt $startletter) - { - continue - } + # all template: https://anidb.net/character/?noalias=1&orderby.name=0.1&page=1&view=list + # letter template: https://anidb.net/character/?char=a&noalias=1&orderby.name=0.1&page=1&view=list + $url = "https://anidb.net/character/?noalias=1&orderby.name=0.1&page=$num&view=list" + $response = Invoke-WebRequest $url -WebSession $Session + #$links = @($response.links | Where-Object { $_.href -match '/character/\d+$' } | Where-Object { $_.outerHTML -match '^]+>[^<][^\n]+$' }) + Start-Sleep -Seconds 3 + return $response.Content +} - $page = 0 - if ($letter -eq $startletter) - { - $page = $startpage - } - - $hasNextPage = $false +function Get-TotalPageCount($start) +{ + $min = $start + $max = $start + 1000 do { - try + $html = Request-Page $max + Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]" + $max += 1000 + } while ($html.contains('>next')) + + while ($min -lt $max) + { + $middle = $min + [int](($max - $min) / 2) + $html = Request-Page $middle + if ($html.Contains('>next')) { - Write-Host "Requesting letter $letter, page $page..." - $url = "https://anidb.net/character/?char=$letter&noalias=1&orderby.name=0.1&view=list&page=$page" - $response = Invoke-WebRequest $url -WebSession $Session - $html = $response.content - $hasNextPage = $html.contains('>next') - $pos = $html.IndexOf('') - if ($pos -lt 1) - { - if ($html.Contains('Please unban me')) - { - Write-Host 'Saving the results...' - '# https://anidb.net/character' | Out-File -LiteralPath $output - $result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append - Read-Host -Prompt 'Rate limited, plz unban...' - continue - } - else - { - Write-Host $html - Write-Error "This script needs updating" - exit -1 - } - } - - do - { - $pos = $html.IndexOf('', $pos) - $name = $html.Substring($pos + 1, $endPos - $pos - 1).Trim() - - $pos = $html.IndexOf('', $pos) - $type = $html.Substring($pos + 1, $endPos - $pos - 1) - - if ($type -ieq 'Character') # consider adding Mecha - { - $result += $name - } - else - { - Write-Host "Skipped $name ($type)" - } - } until ($pos -lt 0) - - $page++ - Start-Sleep -Seconds 2 # increase if needed + $min = $middle } - catch + elseif ($html.Contains('
No results.')) { - Write-Host "Failed to fetch the page" -ForegroundColor Yellow + $max = $middle } - - } while ($hasNextPage) + else + { + return $middle + } + Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]" + } + return $middle } +try +{ + Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0 + $total = Get-TotalPageCount $startPage +} +catch +{ + Write-Warning "Failed to count total number of pages, using default [$total]" +} + +$startTime = [DateTime]::UtcNow +$remainingSeconds = ($total - $page) * 1.7 + +do +{ + try + { + Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete ($page * 100.0 / $total) -SecondsRemaining $remainingSeconds + $html = Request-Page $page + $hasNextPage = $html.Contains('>next') + $pos = $html.IndexOf('
', $pos) - $endPos = $html.IndexOf('', $pos) - $endPos = $html.IndexOf('
') + if ($pos -lt 1) + { + if ($html.Contains('Please unban me')) + { + Write-Host 'Saving the results...' + '# https://anidb.net/character' | Out-File -LiteralPath $output + $result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append + Read-Host -Prompt 'Rate limited, plz unban...' + continue + } + else + { + Write-Host $html + Write-Error "This script needs updating" + exit -1 + } + } + + do + { + $pos = $html.IndexOf('', $pos) + $name = $html.Substring($pos + 1, $endPos - $pos - 1).Replace(' ', ' ').Trim() + + $pos = $html.IndexOf('', $pos) + $type = $html.Substring($pos + 1, $endPos - $pos - 1) + + if (($type -ieq 'Character') -or ($includeMecha -and ($type -ieq 'Mecha'))) + { + if (($name.Length -lt 2) -or ("$name" -match '^\d+$')) + { + Write-Host "Skipping $name" + continue + } + + $result += $name + } + else + { + Write-Host "Skipping $name ($type)" + } + } until ($pos -lt 0) + + $page++ + $total = [Math]::Max($total, $response.pageInfo.lastPage) + if (($page - $startPage) -gt 60) + { + $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page) + } + else + { + $remainingSeconds = ($total - $page) * 1.7 + } + } + catch + { + Write-Host "Failed to fetch the page" -ForegroundColor Yellow + $hasNextPage = $false + } +} while ($hasNextPage) +Write-Host "Stopped on page $page" +Write-Progress -Activity "Downloading" -Completed + Write-Host 'Saving the results...' '# https://anidb.net/character' | Out-File -LiteralPath $output $result | Sort-Object | Get-Unique | Out-File -LiteralPath $output -Append diff --git a/get-names_anilist.ps1 b/get-names_anilist.ps1 index 0cbd7b46..ae00c8cc 100644 --- a/get-names_anilist.ps1 +++ b/get-names_anilist.ps1 @@ -2,12 +2,16 @@ <# .SYNOPSIS Script to crawl AniList for character names +.PARAMETER startPage +Start page number. +Default is 0. .PARAMETER output Output filename. Default is names_anilist.txt. #> param( + [int]$startPage = 0, [string]$output = 'names_anilist.txt' ) @@ -15,41 +19,104 @@ param( #$ProgressPreference = "SilentlyContinue" $requestTemplate = '{ "query": "query { Page (page: {0}, perPage: 50) { characters { name { full } } pageInfo { hasNextPage lastPage } } }" }' -$startTime = [DateTime]::UtcNow -$page = 0 +$page = $startPage +$total = 2565 $result = @() $hasNextPage = $false + +function Request-Page($num) +{ + $json = $requestTemplate.Replace('{0}', $num) + $response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json" -TimeoutSec 30 + Start-Sleep -Seconds 1.5 + return $response.data.Page +} + +function Get-TotalPageCount($start) +{ + $min = $start + $max = $start + 1000 + do + { + $response = Request-Page $max + Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]" + $max += 1000 + } while ($response.pageInfo.hasNextPage) + + while ($min -lt $max) + { + $middle = $min + [int](($max - $min) / 2) + $response = Request-Page $middle + if ($response.pageInfo.hasNextPage) + { + $min = $middle + } + elseif ($response.characters.Count -eq 0) + { + $max = $middle + } + else + { + return $middle + } + Write-Progress -Activity "Downloading" -Status "Counting pages... [$min - $max]" + } + return $middle +} + +try +{ + Write-Progress -Activity "Downloading" -Status "Counting pages..." -PercentComplete 0 + $total = Get-TotalPageCount $startPage +} +catch +{ + Write-Warning "Failed to count total number of pages, using default [$total]" +} + +$startTime = [DateTime]::UtcNow +$remainingSeconds = ($total - $page) * 1.7 do { try { - $json = $requestTemplate.Replace('{0}', $page) - $response = Invoke-RestMethod 'https://graphql.anilist.co' -Method Post -Body $json -ContentType "application/json" - $response = $response.data.Page + Write-Progress -Activity "Downloading" -Status "Page $page of $total" -PercentComplete ($page * 100.0 / $total) -SecondsRemaining $remainingSeconds + $response = Request-Page $page $hasNextPage = $response.pageInfo.hasNextPage $chars = $response.characters foreach ($char in $chars) { - $result += $char.name.full.Trim() + $name = $char.name.full + if ($null -eq $name) { continue } + + $name = $char.name.full.Replace(' ', ' ').Trim() + if (($name.Length -lt 2) -or ("$name" -match '^\d+$')) + { + Write-Host "Skipping $name" + continue + } + $result += $name } $page++ - $total = $response.pageInfo.lastPage - $remainingSeconds = ($total - $page) * 1.7 - if ($page -gt 100) + $total = [Math]::Max($total, $response.pageInfo.lastPage) + if (($page - $startPage) -gt 60) { - $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / $page * ($total - $page) + $remainingSeconds = ([DateTime]::UtcNow - $startTime).TotalSeconds / ($page - $startPage) * ($total - $page) + } + else + { + $remainingSeconds = ($total - $page) * 1.7 } - Write-Progress Downloading -CurrentOperation "Page $page out of $total" -PercentComplete ($page * 100 / $total) -SecondsRemaining $remainingSeconds - Start-Sleep -Seconds 1 } catch { - Write-Host "Failed to request page $page" + Write-Error "Failed to request page $page`: $_" $hasNextPage = $false } } while ($hasNextPage) -Write-Progress Downloading -Completed +Write-Host "Stopped on page $page" +Write-Progress -Activity "Downloading" -Completed Write-Host 'Saving the results...' '# https://anilist.co/search/characters' | Out-File -LiteralPath $output
', $pos) + $endPos = $html.IndexOf('', $pos) + $endPos = $html.IndexOf('