add anidb character name dump (no mecha)

This commit is contained in:
13xforever 2021-03-10 00:56:43 +05:00
parent 67667e6855
commit 568e31eb48
3 changed files with 80371 additions and 2 deletions

View File

@ -1,7 +1,6 @@
using System;
using System.Diagnostics;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net.Http;
using System.Text;
@ -18,7 +17,7 @@ using Microsoft.EntityFrameworkCore;
namespace CompatBot.Commands
{
[Group("fortune")]
[Group("fortune"), Aliases("fortunes")]
[Description("Gives you a fortune once a day")]
internal sealed class Fortune : BaseCommandModuleCustom
{

114
get-names_anidb.ps1 Normal file
View File

@ -0,0 +1,114 @@
#!/usr/bin/pwsh
<#
.SYNOPSIS
    Script to crawl AniDB for character names
.DESCRIPTION
    Walks the AniDB character list letter by letter (a-z) and page by page,
    extracts every entry whose type is 'Character', and writes the sorted,
    de-duplicated names to the output file (preceded by a source comment line).
    If AniDB rate-limits the session, the names collected so far are saved and
    the script waits for the user to confirm before retrying the same page.
    If a page cannot be fetched after several attempts, or the page markup is
    not recognized, the collected names are saved and the script exits with -1.
.PARAMETER startletter
    Letter to start from (a-z). Useful for resuming.
    Default is a.
.PARAMETER startpage
    Letter page to start from. Assumes the default number of rows per page.
    Default is 0.
.PARAMETER output
    Output filename.
    Default is names_anidb.txt.
#>
param(
    [char]$startletter = 'a',
    [int]$startpage = 0,
    [string]$output = 'names_anidb.txt'
)

# Writes the source header followed by the sorted, de-duplicated names to $path.
function Save-Names
{
    param(
        [System.Collections.Generic.List[string]]$names,
        [string]$path
    )
    Write-Host 'Saving the results...'
    '# https://anidb.net/character' | Out-File -LiteralPath $path
    $names | Sort-Object | Get-Unique | Out-File -LiteralPath $path -Append
}

# disable extra noise for invoke-webrequest
$ProgressPreference = "SilentlyContinue"

# delay between successful requests; increase if needed
$delaySeconds = 2
# how many consecutive fetch failures of the same page are tolerated
$maxRetries = 5

# get anonymous session
Invoke-WebRequest "https://anidb.net" -SessionVariable 'Session' | Out-Null
Start-Sleep -Seconds $delaySeconds

# a List avoids re-copying the whole array on every append
$result = [System.Collections.Generic.List[string]]::new()
# char comparison is case-sensitive, so accept 'B' as well as 'b'
$startletter = [char]::ToLowerInvariant($startletter)

foreach ($letter in 'a'..'z')
{
    if ($letter -lt $startletter)
    {
        continue
    }

    $page = if ($letter -eq $startletter) { $startpage } else { 0 }
    $failures = 0
    while ($true)
    {
        Write-Host "Requesting letter $letter, page $page..."
        $url = "https://anidb.net/character/?char=$letter&noalias=1&orderby.name=0.1&view=list&page=$page"
        try
        {
            $html = (Invoke-WebRequest $url -WebSession $Session).Content
        }
        catch
        {
            $failures++
            Write-Host "Failed to fetch the page: $($_.Exception.Message)" -ForegroundColor Yellow
            if ($failures -ge $maxRetries)
            {
                if ($result.Count -gt 0)
                {
                    Save-Names -names $result -path $output
                }
                Write-Error "Giving up after $failures failed attempts. Resume with -startletter $letter -startpage $page"
                exit -1
            }
            # back off a little longer on every failure, then retry the same page
            Start-Sleep -Seconds ($delaySeconds * $failures)
            continue
        }
        $failures = 0

        $pos = $html.IndexOf('<table class="characterlist">')
        if ($pos -lt 1)
        {
            if ($html.Contains('Please unban me'))
            {
                Save-Names -names $result -path $output
                Read-Host -Prompt 'Rate limited, plz unban...'
                # retry the same page; the ban page has no 'next' link, so its
                # content must not be used to decide whether to go on
                continue
            }

            Write-Host $html
            if ($result.Count -gt 0)
            {
                # keep what was collected so far instead of losing it
                Save-Names -names $result -path $output
            }
            Write-Error "This script needs updating"
            exit -1
        }

        $hasNextPage = $html.Contains('>next</a></li>')

        # walk the table rows: each row has a Title cell (link with the name)
        # followed by a Type cell
        while ($true)
        {
            $pos = $html.IndexOf('<td data-label="Title"', $pos)
            if ($pos -lt 0)
            {
                break
            }

            $linkPos = $html.IndexOf('<a href=', $pos)
            $nameStart = if ($linkPos -ge 0) { $html.IndexOf('>', $linkPos) } else { -1 }
            $nameEnd = if ($nameStart -ge 0) { $html.IndexOf('</a></td>', $nameStart) } else { -1 }
            $typeCell = if ($nameEnd -ge 0) { $html.IndexOf('<td data-label="Type"', $nameEnd) } else { -1 }
            $typeStart = if ($typeCell -ge 0) { $html.IndexOf('>', $typeCell) } else { -1 }
            $typeEnd = if ($typeStart -ge 0) { $html.IndexOf('</td>', $typeStart) } else { -1 }
            if ($typeEnd -lt 0)
            {
                Write-Host "Unexpected row markup on letter $letter, page $page; skipping the rest of the page" -ForegroundColor Yellow
                break
            }

            $name = $html.Substring($nameStart + 1, $nameEnd - $nameStart - 1).Trim()
            # names come from HTML, so turn entities such as &amp; back into characters
            $name = [System.Net.WebUtility]::HtmlDecode($name)
            $type = $html.Substring($typeStart + 1, $typeEnd - $typeStart - 1).Trim()
            if ($type -ieq 'Character') # consider adding Mecha
            {
                $result.Add($name)
            }
            else
            {
                Write-Host "Skipped $name ($type)"
            }
            $pos = $typeEnd
        }

        # be polite to the server even when moving on to the next letter
        Start-Sleep -Seconds $delaySeconds
        if (-not $hasNextPage)
        {
            break
        }
        $page++
    }
}

Save-Names -names $result -path $output
Write-Host 'Done.'

80256
names_anidb.txt Normal file

File diff suppressed because it is too large Load Diff