mirror of
https://github.com/RPCS3/discord-bot.git
synced 2026-01-31 01:25:22 +01:00
multi-threaded log parsing
This commit is contained in:
337
CompatBot/Utils/Extensions/StringUtils.cs
Normal file
337
CompatBot/Utils/Extensions/StringUtils.cs
Normal file
@@ -0,0 +1,337 @@
|
||||
using System;
|
||||
using System.Buffers;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using DuoVia.FuzzyStrings;
|
||||
using HomoglyphConverter;
|
||||
using Microsoft.Extensions.Caching.Memory;
|
||||
|
||||
namespace CompatBot.Utils
|
||||
{
|
||||
public static class StringUtils
|
||||
{
|
||||
/// <summary>
/// Default encoding for raw byte buffers: the first encoding reported by
/// <see cref="Encoding.GetEncodings"/> whose code page is 1250, 1252 or 28591,
/// or <see cref="Encoding.ASCII"/> when none of them is listed.
/// </summary>
private static readonly Encoding Latin8BitEncoding = ResolveLatin8BitEncoding();

/// <summary>UTF-8 encoding that does not emit a byte order mark.</summary>
private static readonly Encoding Utf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false);

/// <summary>Cache of fuzzy-match results, keyed by the value produced by <c>GetFuzzyCacheKey</c>.</summary>
private static readonly MemoryCache FuzzyPairCache = new MemoryCache(
    new MemoryCacheOptions
    {
        ExpirationScanFrequency = TimeSpan.FromMinutes(10),
    });

/// <summary>Combining overlay used to render struck-through text.</summary>
// 0x0335 = short dash, 0x0336 = long dash, 0x0337 = short slash, 0x0338 = long slash
private const char StrikeThroughChar = '\u0336';

/// <summary>Picks the encoding stored in <see cref="Latin8BitEncoding"/>.</summary>
private static Encoding ResolveLatin8BitEncoding()
{
    foreach (var info in Encoding.GetEncodings())
    {
        switch (info.CodePage)
        {
            case 1250:
            case 1252:
            case 28591:
                // first match in enumeration order wins
                return info.GetEncoding() ?? Encoding.ASCII;
        }
    }
    return Encoding.ASCII;
}
|
||||
|
||||
/// <summary>
/// Code points looked up by <c>IsFormat</c>; <c>TrimEager</c> strips them from both ends
/// of a string in addition to Unicode whitespace.
/// </summary>
private static readonly HashSet<char> SpaceCharacters = new HashSet<char>
{
    '\u00a0',
    '\u2002', '\u2003', '\u2004', '\u2005', '\u2006',
    '\u2007', '\u2008', '\u2009', '\u200a', '\u200b',
    '\u200c', '\u200d', '\u200e', '\u200f',
    // U+2028..U+202F, each listed once (the original table repeated U+202C)
    '\u2028', '\u2029', '\u202a', '\u202b', '\u202c',
    '\u202d', '\u202e', '\u202f',
    '\u205f', '\u2060', '\u2061', '\u2062', '\u2063',
    '\u2064', '\u2065', '\u2066', '\u2067', '\u2068',
    '\u2069', '\u206a', '\u206b', '\u206c', '\u206d',
    '\u206e', '\u206f',
    '\u3000', '\u303f',
};
|
||||
|
||||
/// <summary>
/// Removes the trademark markers <c>(R)</c>, <c>®</c>, <c>(TM)</c> and <c>™</c> from a string.
/// Matching ignores case (invariant culture), so <c>(tm)</c> and <c>(r)</c> are removed as well.
/// </summary>
/// <param name="str">Text to clean up; may be <c>null</c>.</param>
/// <returns>
/// <paramref name="str"/> without the markers; a <c>null</c> or empty input is returned unchanged.
/// </returns>
public static string StripMarks(this string str)
{
    // Pass null/empty through instead of throwing, like the other helpers in this class.
    if (string.IsNullOrEmpty(str))
        return str;

    return str.Replace("(R)", "", StringComparison.InvariantCultureIgnoreCase)
        .Replace("®", "", StringComparison.InvariantCultureIgnoreCase)
        .Replace("(TM)", "", StringComparison.InvariantCultureIgnoreCase)
        .Replace("™", "", StringComparison.InvariantCultureIgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Removes one pair of surrounding double quotes, if present.
/// </summary>
/// <param name="str">Text to unwrap; may be <c>null</c>.</param>
/// <returns>
/// The text between the quotes when <paramref name="str"/> both starts and ends with <c>"</c>
/// and is at least two characters long; otherwise <paramref name="str"/> unchanged.
/// </returns>
public static string StripQuotes(this string str)
{
    const char quote = '"';
    var isWrapped = str != null
                    && str.Length >= 2
                    && str[0] == quote
                    && str[str.Length - 1] == quote;
    return isWrapped ? str.Substring(1, str.Length - 2) : str;
}
|
||||
|
||||
/// <summary>
/// Trims a string more aggressively than <see cref="string.Trim()"/>: besides Unicode whitespace it
/// also removes, from both ends, the characters accepted by <c>IsFormat</c> and surrogate pairs whose
/// code point is unassigned and at or above U+E0000. At the start any low surrogate is skipped too.
/// </summary>
/// <param name="str">Text to trim; may be <c>null</c>.</param>
/// <returns>The trimmed text; <c>null</c> or empty input is returned unchanged.</returns>
public static string TrimEager(this string str)
{
    if (string.IsNullOrEmpty(str))
        return str;

    bool IsBlank(int idx) => char.IsWhiteSpace(str, idx) || IsFormat(str[idx]);

    // A high surrogate >= 0xDB40 starts a code point >= U+E0000
    // (see https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF)
    bool IsUnassignedHighPlaneLead(int idx)
        => char.IsHighSurrogate(str, idx)
           && char.GetUnicodeCategory(str, idx) == UnicodeCategory.OtherNotAssigned
           && str[idx] >= 0xdb40;

    // Walk forward past everything trimmable.
    var first = 0;
    while (first < str.Length
           && (IsBlank(first) || IsUnassignedHighPlaneLead(first) || char.IsLowSurrogate(str, first)))
        first++;

    // Walk backward; a low surrogate is only trimmable as the tail of a trimmable pair.
    var last = str.Length - 1;
    while (last >= first)
    {
        var tailOfTrimmablePair = char.IsLowSurrogate(str, last)
                                  && last > first
                                  && IsUnassignedHighPlaneLead(last - 1);
        if (!(IsBlank(last) || tailOfTrimmablePair || IsUnassignedHighPlaneLead(last)))
            break;

        last--;
    }

    return CreateTrimmedString(str, first, last);
}
|
||||
|
||||
/// <summary>
/// Decodes a byte sequence into a string.
/// </summary>
/// <param name="buffer">Bytes to decode; may consist of several segments.</param>
/// <param name="encoding">Encoding to use; defaults to <c>Latin8BitEncoding</c> when <c>null</c>.</param>
/// <returns>The decoded text.</returns>
public static string AsString(this ReadOnlySequence<byte> buffer, Encoding encoding = null)
{
    encoding = encoding ?? Latin8BitEncoding;
    if (buffer.IsSingleSegment)
        return encoding.GetString(buffer.First.Span);

    // The segment-by-segment path below sizes the result as one char per byte, which only holds
    // for single-byte encodings. Anything else (e.g. UTF-8) may also split a character across
    // segments, so decode it from one contiguous copy instead.
    if (!encoding.IsSingleByte)
        return encoding.GetString(buffer.ToArray());

    void Splice(Span<char> span, ReadOnlySequence<byte> sequence)
    {
        foreach (var segment in sequence)
        {
            // advance by what was actually written rather than by the byte count
            var written = encoding.GetChars(segment.Span, span);
            span = span.Slice(written);
        }
    }

    return string.Create((int)buffer.Length, buffer, Splice);
}
|
||||
|
||||
/// <summary>
/// Re-interprets text as UTF-8: encodes <paramref name="str"/> with <c>Latin8BitEncoding</c>
/// and decodes the resulting bytes with <c>Utf8</c>.
/// </summary>
/// <param name="str">Text to convert.</param>
/// <returns>The re-decoded text.</returns>
public static string ToUtf8(this string str)
{
    var rawBytes = Latin8BitEncoding.GetBytes(str);
    return Utf8.GetString(rawBytes);
}
|
||||
|
||||
/// <summary>
/// Inverse of <c>ToUtf8</c>: encodes <paramref name="str"/> with <c>Utf8</c> and decodes the
/// resulting bytes with <c>Latin8BitEncoding</c>.
/// </summary>
/// <param name="str">Text to convert.</param>
/// <returns>The re-decoded text, or <paramref name="str"/> itself when the conversion throws (the error is logged).</returns>
public static string ToLatin8BitEncoding(this string str)
{
    var converted = str;
    try
    {
        var utf8Bytes = Utf8.GetBytes(str);
        converted = Latin8BitEncoding.GetString(utf8Bytes);
    }
    catch (Exception e)
    {
        // best effort: report and fall back to the input
        Config.Log.Error(e, $"Failed to decode string from {Latin8BitEncoding.EncodingName} to {Utf8.EncodingName}");
    }
    return converted;
}
|
||||
|
||||
/// <summary>Returns the plural suffix for a count.</summary>
/// <param name="num">Number of items.</param>
/// <returns>An empty string when <paramref name="num"/> is exactly 1, otherwise <c>"s"</c>.</returns>
public static string GetSuffix(long num)
{
    if (num == 1)
        return "";

    return "s";
}
|
||||
|
||||
/// <summary>
/// Replaces every occurrence of the search string below with a space followed by U+200D (zero width joiner).
/// </summary>
/// <param name="text">Text to process; may be <c>null</c>.</param>
/// <returns>The processed text, or <c>null</c> when <paramref name="text"/> is <c>null</c>.</returns>
public static string FixSpaces(this string text)
{
    if (text == null)
        return null;

    return text.Replace(" ", " \u200d");
}
|
||||
|
||||
/// <summary>
/// Counts the text elements of the normalized string, skipping those whose first char is a
/// control character, a <see cref="UnicodeCategory.Format"/> character, or the strike-through overlay.
/// </summary>
/// <param name="s">Text to measure; may be <c>null</c>.</param>
/// <returns>The number of counted text elements; 0 for <c>null</c> or empty input.</returns>
public static int GetVisibleLength(this string s)
{
    if (string.IsNullOrEmpty(s))
        return 0;

    var visible = 0;
    var elements = StringInfo.GetTextElementEnumerator(s.Normalize());
    while (elements.MoveNext())
    {
        // only the leading char of each text element decides whether it counts
        var lead = elements.GetTextElement()[0];
        var invisible = char.IsControl(lead)
                        || char.GetUnicodeCategory(lead) == UnicodeCategory.Format
                        || lead == StrikeThroughChar;
        if (!invisible)
            visible++;
    }
    return visible;
}
|
||||
|
||||
/// <summary>
/// Shortens a string to at most <paramref name="maxLength"/> visible text elements, replacing the
/// cut-off tail with an ellipsis (<c>…</c>). Visibility is judged the same way as in
/// <c>GetVisibleLength</c>.
/// </summary>
/// <param name="s">Text to shorten; may be <c>null</c>.</param>
/// <param name="maxLength">Maximum number of visible elements in the result, ellipsis included; must be at least 1.</param>
/// <returns>
/// <paramref name="s"/> unchanged when it is <c>null</c>, empty, or already fits; otherwise the first
/// <c>maxLength - 1</c> visible elements of the normalized string followed by <c>…</c>.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="maxLength"/> is less than 1 (only checked for non-empty input).</exception>
public static string TrimVisible(this string s, int maxLength)
{
    if (string.IsNullOrEmpty(s))
        return s;

    if (maxLength < 1)
        throw new ArgumentException("Max length can't be less than 1", nameof(maxLength));

    if (s.Length <= maxLength)
        return s;

    // The char count can exceed the visible length (combining marks, format characters).
    // Without this check such a string got an ellipsis appended although nothing had to be cut.
    if (s.GetVisibleLength() <= maxLength)
        return s;

    var c = 0;
    var e = StringInfo.GetTextElementEnumerator(s.Normalize());
    var result = new StringBuilder();
    while (c < maxLength - 1 && e.MoveNext())
    {
        var strEl = e.GetTextElement();
        // invisible elements are copied but do not use up the budget
        result.Append(strEl);
        if (char.IsControl(strEl[0]) || char.GetUnicodeCategory(strEl[0]) == UnicodeCategory.Format || strEl[0] == StrikeThroughChar)
            continue;

        c++;
    }
    return result.Append("…").ToString();
}
|
||||
|
||||
/// <summary>
/// Left-pads a string so that its visible length (see <c>GetVisibleLength</c>) reaches
/// <paramref name="totalWidth"/>.
/// </summary>
/// <param name="s">Text to pad; <c>null</c> is treated as an empty string.</param>
/// <param name="totalWidth">Desired visible width.</param>
/// <param name="padding">Character used for padding.</param>
/// <returns>The padded text.</returns>
public static string PadLeftVisible(this string s, int totalWidth, char padding = ' ')
{
    var text = s ?? "";
    // chars that take up string length but are not counted as visible
    var hiddenChars = text.Length - text.GetVisibleLength();
    return text.PadLeft(totalWidth + hiddenChars, padding);
}
|
||||
|
||||
/// <summary>
/// Right-pads a string so that its visible length (see <c>GetVisibleLength</c>) reaches
/// <paramref name="totalWidth"/>.
/// </summary>
/// <param name="s">Text to pad; <c>null</c> is treated as an empty string.</param>
/// <param name="totalWidth">Desired visible width.</param>
/// <param name="padding">Character used for padding.</param>
/// <returns>The padded text.</returns>
public static string PadRightVisible(this string s, int totalWidth, char padding = ' ')
{
    var text = s ?? "";
    // chars that take up string length but are not counted as visible
    var hiddenChars = text.Length - text.GetVisibleLength();
    return text.PadRight(totalWidth + hiddenChars, padding);
}
|
||||
|
||||
/// <summary>
/// Produces a struck-through rendition of a string by inserting the combining overlay
/// <c>StrikeThroughChar</c> before the text and after every letter, digit and low surrogate,
/// except after the very last character.
/// </summary>
/// <param name="str">Text to decorate; may be <c>null</c>.</param>
/// <returns>The decorated text; <c>null</c> or empty input is returned unchanged.</returns>
public static string StrikeThrough(this string str)
{
    if (string.IsNullOrEmpty(str))
        return str;

    var result = new StringBuilder(str.Length * 2 + 1);
    result.Append(StrikeThroughChar);
    var endsWithOverlay = false;
    foreach (var c in str)
    {
        result.Append(c);
        endsWithOverlay = char.IsLetterOrDigit(c) || char.IsLowSurrogate(c);
        if (endsWithOverlay)
            result.Append(StrikeThroughChar);
    }

    // Drop the trailing overlay only when one was appended. Unconditionally cutting the last
    // char removed real text whenever the input ended with punctuation or whitespace.
    if (endsWithOverlay)
        result.Length--;

    return result.ToString();
}
|
||||
|
||||
/// <summary>
/// Renders a rating as a row of moon emoji: one full moon per whole star, one partial moon for the
/// fractional part (rounded to quarters), and new moons for the remainder of a five-slot scale.
/// With a 1-in-100 chance a moon-with-face variant is used for the rounded-up full moon or for
/// the first new moon.
/// </summary>
/// <param name="stars">Rating to render; may be <c>null</c>.</param>
/// <returns>The emoji string, or <c>null</c> when <paramref name="stars"/> has no value.</returns>
public static string GetMoons(decimal? stars)
{
    if (!stars.HasValue)
        return null;

    var rating = stars.Value;
    var whole = (int)rating;
    var quarters = (int)Math.Round((rating - whole) * 4, MidpointRounding.ToEven);
    var usesPartialSlot = quarters > 0 && quarters <= 4;
    var dark = 5 - (usesPartialSlot ? 1 : 0) - whole;

    var moons = new StringBuilder();
    for (var i = 0; i < whole; i++)
        moons.Append("🌕");

    switch (quarters)
    {
        case 4:
            moons.Append(new Random().Next(100) == 69 ? "🌝" : "🌕");
            break;
        case 3:
            moons.Append("🌖");
            break;
        case 2:
            moons.Append("🌗");
            break;
        case 1:
            moons.Append("🌘");
            break;
    }

    for (var i = 0; i < dark; i++)
    {
        // the random roll only happens for the first empty slot of a rating without a fraction
        var easterEgg = i == 0 && quarters == 0 && new Random().Next(100) == 69;
        moons.Append(easterEgg ? "🌚" : "🌑");
    }
    return moons.ToString();
}
|
||||
|
||||
/// <summary>
/// Renders a rating as filled and hollow stars on a five-star scale; the rating is rounded to the
/// nearest integer with ties going to the even number.
/// </summary>
/// <param name="stars">Rating to render; may be <c>null</c>.</param>
/// <returns>The star string, or <c>null</c> when <paramref name="stars"/> has no value.</returns>
public static string GetStars(decimal? stars)
{
    if (!stars.HasValue)
        return null;

    var filled = (int)Math.Round(stars.Value, MidpointRounding.ToEven);
    var hollow = 5 - filled;
    // negative counts simply contribute nothing
    return new StringBuilder()
        .Append('★', Math.Max(filled, 0))
        .Append('☆', Math.Max(hollow, 0))
        .ToString();
}
|
||||
|
||||
/// <summary>Tells whether <paramref name="c"/> is one of the characters listed in <c>SpaceCharacters</c>.</summary>
private static bool IsFormat(char c)
{
    return SpaceCharacters.Contains(c);
}
|
||||
|
||||
/// <summary>
/// Returns the part of <paramref name="str"/> between two inclusive indices, avoiding an allocation
/// when the range covers the whole string or is empty.
/// </summary>
/// <param name="str">Source text.</param>
/// <param name="start">Index of the first char to keep.</param>
/// <param name="end">Index of the last char to keep; <c>start - 1</c> denotes an empty range.</param>
/// <returns>The original instance, an empty string, or the requested substring.</returns>
private static string CreateTrimmedString(string str, int start, int end)
{
    var count = end - start + 1;
    if (count == str.Length)
        return str;

    if (count == 0)
        return "";

    return str.Substring(start, count);
}
|
||||
|
||||
/// <summary>
/// Builds an acronym from the first char of every run of letters/digits in the string.
/// </summary>
/// <param name="str">Text to abbreviate; may be <c>null</c>.</param>
/// <returns>The acronym; <c>null</c> or empty input is returned unchanged.</returns>
internal static string GetAcronym(this string str)
{
    if (string.IsNullOrEmpty(str))
        return str;

    var initials = new StringBuilder();
    var insideWord = false;
    for (var i = 0; i < str.Length; i++)
    {
        var ch = str[i];
        var alphanumeric = char.IsLetterOrDigit(ch);
        // a letter/digit that follows a non-letter/digit starts a new word
        if (alphanumeric && !insideWord)
            initials.Append(ch);
        insideWord = alphanumeric;
    }
    return initials.ToString();
}
|
||||
|
||||
/// <summary>
/// Returns the fuzzy-match score of two strings, compared case-insensitively in canonical form,
/// reusing a previously computed result from <c>FuzzyPairCache</c> when one exists for the same pair.
/// </summary>
/// <param name="strA">First string; <c>null</c> is treated as empty.</param>
/// <param name="strB">Second string; <c>null</c> is treated as empty.</param>
/// <returns>The score computed by <c>GetScoreWithAcronym</c> for the pair.</returns>
internal static double GetFuzzyCoefficientCached(this string strA, string strB)
{
    strA = strA?.ToLowerInvariant() ?? "";
    strB = strB?.ToLowerInvariant() ?? "";
    var cacheKey = GetFuzzyCacheKey(strA, strB);

    // The key is built from hash codes and lengths, so different pairs can collide;
    // the stored strings are compared to make sure the hit really belongs to this pair.
    if (FuzzyPairCache.TryGetValue(cacheKey, out FuzzyCacheValue match)
        && match != null
        && strA == match.StrA
        && strB == match.StrB)
        return match.Coefficient;

    match = new FuzzyCacheValue
    {
        StrA = strA,
        StrB = strB,
        Coefficient = Normalizer.ToCanonicalForm(strA).GetScoreWithAcronym(Normalizer.ToCanonicalForm(strB)),
    };
    // Only write on a miss; re-storing the same entry on every hit was redundant work.
    // NOTE(review): entries are stored without expiration or size limit, so the cache only
    // grows — consider passing MemoryCacheEntryOptions here.
    FuzzyPairCache.Set(cacheKey, match);
    return match.Coefficient;
}
|
||||
|
||||
/// <summary>
/// Scores <paramref name="strA"/> against <paramref name="strB"/> and against the lower-cased
/// acronym of <paramref name="strB"/>, and returns the better of the two Dice coefficients.
/// </summary>
/// <param name="strA">String to score.</param>
/// <param name="strB">String to compare with, both in full and as an acronym.</param>
/// <returns>The larger of the two coefficients.</returns>
private static double GetScoreWithAcronym(this string strA, string strB)
{
    var fullScore = strA.DiceCoefficient(strB);
    var acronymScore = strA.DiceCoefficient(strB.GetAcronym().ToLowerInvariant());
    return Math.Max(fullScore, acronymScore);
}
|
||||
|
||||
/// <summary>
/// Builds the cache key for a pair of strings: the two hash codes packed into a <see cref="long"/>
/// (first string in the upper 32 bits) and the two lengths packed into an <see cref="int"/>
/// (first length shifted left by 16 bits, second length masked to its lower 16 bits).
/// </summary>
/// <param name="strA">First string; must not be <c>null</c>.</param>
/// <param name="strB">Second string; must not be <c>null</c>.</param>
/// <returns>The packed hash pair and the packed length pair.</returns>
private static (long, int) GetFuzzyCacheKey(string strA, string strB)
{
    const int intBits = sizeof(int) * 8;
    const int shortBits = sizeof(short) * 8;

    long upperHash = strA.GetHashCode();
    // zero-extend so a negative hash does not smear into the upper half
    long lowerHash = unchecked((uint)strB.GetHashCode());
    var lowerLength = strB.Length & ushort.MaxValue;

    return ((upperHash << intBits) | lowerHash, (strA.Length << shortBits) | lowerLength);
}
|
||||
|
||||
/// <summary>
/// Entry stored in <c>FuzzyPairCache</c>: the compared pair together with its score. The strings are
/// kept so that a cache hit can be checked against the actual pair.
/// </summary>
private sealed class FuzzyCacheValue
{
    /// <summary>Score computed for the pair.</summary>
    public double Coefficient { get; set; }

    /// <summary>First string of the pair.</summary>
    public string StrA { get; set; }

    /// <summary>Second string of the pair.</summary>
    public string StrB { get; set; }
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user