Re-added .google search result scraper and set it as the default again. Also added versioning to searches.yml as it was forgotten previously

This commit is contained in:
Kwoth
2022-06-14 13:18:35 +02:00
parent f42deda3e2
commit 64b2a46c95
13 changed files with 187 additions and 76 deletions

View File

@@ -1,4 +1,5 @@
using NadekoBot.Modules.Searches.Youtube;
using NadekoBot.Modules.Searches.GoogleScrape;
using NadekoBot.Modules.Searches.Youtube;
namespace NadekoBot.Modules.Searches;
@@ -12,10 +13,12 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
private readonly YtdlYoutubeSearchService _ytdl;
private readonly YoutubeDataApiSearchService _ytdata;
private readonly InvidiousYtSearchService _iYtSs;
private readonly GoogleScrapeService _gscs;
public DefaultSearchServiceFactory(
SearchesConfigService scs,
GoogleSearchService gss,
GoogleScrapeService gscs,
SearxSearchService sss,
YtdlpYoutubeSearchService ytdlp,
YtdlYoutubeSearchService ytdl,
@@ -25,6 +28,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
_scs = scs;
_sss = sss;
_gss = gss;
_gscs = gscs;
_iYtSs = iYtSs;
_ytdlp = ytdlp;
@@ -36,6 +40,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
=> _scs.Data.WebSearchEngine switch
{
WebSearchEngine.Google => _gss,
WebSearchEngine.Google_Scrape => _gscs,
WebSearchEngine.Searx => _sss,
_ => _gss
};

View File

@@ -39,7 +39,7 @@ public sealed class GoogleSearchService : SearchServiceBase, INService
return result;
}
public override async ITask<GoogleCustomSearchResult?> SearchAsync(string query)
public override async ITask<GoogleCustomSearchResult?> SearchAsync(string? query)
{
ArgumentNullException.ThrowIfNull(query);

View File

@@ -1,62 +1,121 @@
// using AngleSharp.Html.Dom;
// using MorseCode.ITask;
// using NadekoBot.Modules.Searches.Common;
//
// namespace NadekoBot.Modules.Searches.GoogleScrape;
//
// public sealed class GoogleScrapeService : SearchServiceBase
// {
// public override async ITask<GoogleSearchResultData> SearchAsync(string query)
// {
// ArgumentNullException.ThrowIfNull(query);
//
// query = Uri.EscapeDataString(query)?.Replace(' ', '+');
//
// var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";
//
// using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
// msg.Headers.Add("User-Agent",
// "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
// msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");
//
// using var http = _httpFactory.CreateClient();
// http.DefaultRequestHeaders.Clear();
//
// using var response = await http.SendAsync(msg);
// await using var content = await response.Content.ReadAsStreamAsync();
//
// using var document = await _googleParser.ParseDocumentAsync(content);
// var elems = document.QuerySelectorAll("div.g > div > div");
//
// var resultsElem = document.QuerySelectorAll("#resultStats").FirstOrDefault();
// var totalResults = resultsElem?.TextContent;
// //var time = resultsElem.Children.FirstOrDefault()?.TextContent
// //^ this doesn't work for some reason, <nobr> is completely missing in parsed collection
// if (!elems.Any())
// return default;
//
// var results = elems.Select(elem =>
// {
// var children = elem.Children.ToList();
// if (children.Count < 2)
// return null;
//
// var href = (children[0].QuerySelector("a") as IHtmlAnchorElement)?.Href;
// var name = children[0].QuerySelector("h3")?.TextContent;
//
// if (href is null || name is null)
// return null;
//
// var txt = children[1].TextContent;
//
// if (string.IsNullOrWhiteSpace(txt))
// return null;
//
// return new GoogleSearchResult(name, href, txt);
// })
// .Where(x => x is not null)
// .ToList();
//
// return new(results.AsReadOnly(), fullQueryLink, totalResults);
// }
// }
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using MorseCode.ITask;
namespace NadekoBot.Modules.Searches.GoogleScrape;
/// <summary>
/// Web search backend that downloads a Google results page and extracts the entries from its HTML.
/// Needs no API keys, but relies on the page's markup (CSS selectors below) and may break when it changes.
/// </summary>
public sealed class GoogleScrapeService : SearchServiceBase, INService
{
    // One parser instance, configured once and shared by every search.
    private static readonly HtmlParser _googleParser = new(new()
    {
        IsScripting = false,
        IsEmbedded = false,
        IsSupportingProcessingInstructions = false,
        IsKeepingSourceReferences = false,
        IsNotSupportingFrames = true
    });

    private readonly IHttpClientFactory _httpFactory;

    public GoogleScrapeService(IHttpClientFactory httpClientFactory)
        => _httpFactory = httpClientFactory;

    /// <summary>
    /// Requests the results page for <paramref name="query"/> and scrapes title, url and description
    /// of every result that has all three.
    /// </summary>
    /// <param name="query">Text to search for.</param>
    /// <returns>
    /// The scraped entries together with the total-results / search-time strings ("?" when they
    /// could not be read), or <c>null</c> when the page contains no result elements.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
    public override async ITask<ISearchResult?> SearchAsync(string? query)
    {
        ArgumentNullException.ThrowIfNull(query);

        // EscapeDataString already encodes spaces (as %20), so nothing else needs replacing.
        var escapedQuery = Uri.EscapeDataString(query);
        var fullQueryLink = $"https://www.google.ca/search?q={escapedQuery}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";

        using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
        msg.Headers.Add("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
        msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");

        using var http = _httpFactory.CreateClient();
        http.DefaultRequestHeaders.Clear();

        using var response = await http.SendAsync(msg);
        await using var content = await response.Content.ReadAsStreamAsync();
        using var document = await _googleParser.ParseDocumentAsync(content);

        var elems = document.QuerySelectorAll("div.g, div.mnr-c > div > div");
        if (!elems.Any())
            return default;

        // The stats are taken from the element's text because reading its child <nobr>
        // directly did not work: that element is missing from the parsed collection.
        var (totalResults, searchTime) = ParseResultStats(document.QuerySelector("#result-stats")?.TextContent);

        var results = elems.Select(elem =>
                           {
                               // Pattern match instead of a hard cast, so an unexpected element
                               // type skips the entry instead of throwing.
                               if (elem.QuerySelector("a") is not IHtmlAnchorElement anchor)
                                   return null;

                               var url = anchor.Href;
                               var title = anchor.QuerySelector("h3")?.TextContent;

                               // Preferred location of the description: first <span> in the element
                               // following the anchor's parent. Fallback: the fourth <span> of the result.
                               var txt = anchor.ParentElement
                                               ?.NextElementSibling
                                               ?.QuerySelector("span")
                                               ?.TextContent
                                               .StripHtml()
                                         ?? elem.QuerySelectorAll("span")
                                                .Skip(3)
                                                .FirstOrDefault()
                                                ?.TextContent
                                                .StripHtml();

                               if (string.IsNullOrWhiteSpace(url)
                                   || string.IsNullOrWhiteSpace(title)
                                   || string.IsNullOrWhiteSpace(txt))
                                   return null;

                               return new PlainSearchResultEntry
                               {
                                   Title = title,
                                   Url = url,
                                   DisplayUrl = url,
                                   Description = txt
                               };
                           })
                           .Where(x => x is not null)
                           .ToList();

        return new PlainGoogleScrapeSearchResult()
        {
            Answer = null,
            Entries = results!,
            Info = new PlainSearchResultInfo()
            {
                SearchTime = searchTime ?? "?",
                TotalResults = totalResults ?? "?"
            }
        };
    }

    /// <summary>
    /// Splits the stats text on the word "results": the part before it is the total,
    /// the part after it (with "(" and "seconds)" removed) is the search time.
    /// Both parts are trimmed; a missing or blank part is reported as <c>null</c>.
    /// </summary>
    private static (string? TotalResults, string? SearchTime) ParseResultStats(string? statsText)
    {
        if (statsText is null)
            return (null, null);

        // Split always yields at least one element, so parts[0] is safe.
        var parts = statsText.Split("results");
        var total = NullIfBlank(parts[0]);
        var time = parts.Length < 2
            ? null
            : NullIfBlank(parts[1]
                          .Replace("(", string.Empty)
                          .Replace("seconds)", string.Empty));

        return (total, time);
    }

    // Trims the text, mapping empty / whitespace-only input to null.
    private static string? NullIfBlank(string text)
        => string.IsNullOrWhiteSpace(text) ? null : text.Trim();

    /// <summary>
    /// Image search is not implemented for the scraper (contributions welcome).
    /// </summary>
    /// <exception cref="NotSupportedException">Always.</exception>
    public override ITask<IImageSearchResult?> SearchImagesAsync(string query)
        => throw new NotSupportedException();
}

View File

@@ -0,0 +1,8 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;
/// <summary>
/// Plain data holder for the outcome of a scraped Google search.
/// </summary>
public class PlainGoogleScrapeSearchResult : ISearchResult
{
    /// <summary>Optional direct answer; null when there is none.</summary>
    // Nullable, so it needs no null-forgiving initializer: it simply defaults to null.
    public string? Answer { get; init; }

    /// <summary>The individual result entries.</summary>
    public IReadOnlyCollection<ISearchResultEntry> Entries { get; init; } = null!;

    /// <summary>Information about the search as a whole.</summary>
    public ISearchResultInformation Info { get; init; } = null!;
}

View File

@@ -0,0 +1,9 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;
/// <summary>
/// Plain data holder for a single scraped search result.
/// </summary>
public sealed class PlainSearchResultEntry : ISearchResultEntry
{
    /// <summary>Title of the result.</summary>
    public string Title { get; init; } = null!;

    /// <summary>Link the result points to.</summary>
    public string Url { get; init; } = null!;

    /// <summary>Link text shown to the user.</summary>
    public string DisplayUrl { get; init; } = null!;

    /// <summary>Optional description text of the result.</summary>
    // Nullable, so it needs no null-forgiving initializer: it simply defaults to null.
    public string? Description { get; init; }
}

View File

@@ -0,0 +1,7 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;
/// <summary>
/// Plain data holder for the summary information of a search.
/// </summary>
public sealed class PlainSearchResultInfo : ISearchResultInformation
{
    /// <summary>Total number of results, as text.</summary>
    public string TotalResults { get; init; } = default!;

    /// <summary>Time the search took, as text.</summary>
    public string SearchTime { get; init; } = default!;
}

View File

@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
/// <summary>
/// Contract for a search backend: one method for web results, one for image results.
/// </summary>
public interface ISearchService
{
// Web search. The next two lines are the same signature as rendered by this diff view:
// first with a 'string' parameter, then with 'string?'.
ITask<ISearchResult?> SearchAsync(string query);
ITask<ISearchResult?> SearchAsync(string? query);
// Image search counterpart of SearchAsync.
ITask<IImageSearchResult?> SearchImagesAsync(string query);
}

View File

@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
/// <summary>
/// Abstract base for search backends. It declares both <see cref="ISearchService"/> members
/// as abstract, so every derived service has to provide its own implementation.
/// </summary>
public abstract class SearchServiceBase : ISearchService
{
// Web search. The next two lines are the same signature as rendered by this diff view:
// first with a 'string' parameter, then with 'string?'.
public abstract ITask<ISearchResult?> SearchAsync(string query);
public abstract ITask<ISearchResult?> SearchAsync(string? query);
// Image search counterpart of SearchAsync.
public abstract ITask<IImageSearchResult?> SearchImagesAsync(string query);
}

View File

@@ -25,7 +25,7 @@ public sealed class SearxSearchService : SearchServiceBase, INService
return instances[_rng.Next(0, instances.Count)];
}
public override async ITask<SearxSearchResult> SearchAsync(string query)
public override async ITask<SearxSearchResult> SearchAsync(string? query)
{
ArgumentNullException.ThrowIfNull(query);

View File

@@ -6,13 +6,17 @@ namespace NadekoBot.Modules.Searches;
[Cloneable]
public partial class SearchesConfig : ICloneable<SearchesConfig>
{
[Comment("DO NOT CHANGE")]
public int Version { get; set; } = 0;
[Comment(@"Which engine should .search command
'google' requires googleApiKey and google.searchId set in creds.yml
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google;
'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
'searx' - requires at least one searx instance specified in the 'searxInstances' property below")]
public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google_Scrape;
[Comment(@"Which engine should .image command use
'google' requires googleApiKey and google.imageSearchId set in creds.yml
'google'- official google api. googleApiKey and google.imageSearchId set in creds.yml
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google;

View File

@@ -27,5 +27,19 @@ public class SearchesConfigService : ConfigServiceBase<SearchesConfig>
sc => sc.YtProvider,
ConfigParsers.InsensitiveEnum,
ConfigPrinters.ToString);
Migrate();
}
/// <summary>
/// Upgrades a config whose version is below 1: sets the version to 1 and switches
/// the web search engine to <see cref="WebSearchEngine.Google_Scrape"/>.
/// Does nothing when the config is already at version 1 or higher.
/// </summary>
private void Migrate()
{
    // Nothing to migrate.
    if (data.Version >= 1)
        return;

    ModifyConfig(cfg =>
    {
        cfg.Version = 1;
        cfg.WebSearchEngine = WebSearchEngine.Google_Scrape;
    });
}
}

View File

@@ -1,7 +1,9 @@
namespace NadekoBot.Modules.Searches;
// ReSharper disable InconsistentNaming
namespace NadekoBot.Modules.Searches;
/// <summary>
/// Engines that can be selected for the web search command.
/// </summary>
public enum WebSearchEngine
{
    /// <summary>Official Google API.</summary>
    Google = 0,

    /// <summary>Scrapes the Google results page; requires no API keys.</summary>
    Google_Scrape = 1,

    /// <summary>Searx instance.</summary>
    Searx = 2,
}

View File

@@ -1,9 +1,12 @@
# DO NOT CHANGE
version: 1
# Which engine should the .search command use
# 'google' requires googleApiKey and google.searchId set in creds.yml
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
webSearchEngine: Google
# 'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
# 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
# 'searx' - requires at least one searx instance specified in the 'searxInstances' property below
webSearchEngine: Google_Scrape
# Which engine should .image command use
# 'google' requires googleApiKey and google.imageSearchId set in creds.yml
# 'google' - official google api. Requires googleApiKey and google.imageSearchId set in creds.yml
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
imgSearchEngine: Google
# Which search provider will be used for the `.youtube` command.