Re-added .google search result scraper and set it as the default again. Also added versioning to searches.yml as it was forgotten previously

This commit is contained in:
Kwoth
2022-06-14 13:18:35 +02:00
parent f42deda3e2
commit 64b2a46c95
13 changed files with 187 additions and 76 deletions

View File

@@ -1,4 +1,5 @@
using NadekoBot.Modules.Searches.Youtube; using NadekoBot.Modules.Searches.GoogleScrape;
using NadekoBot.Modules.Searches.Youtube;
namespace NadekoBot.Modules.Searches; namespace NadekoBot.Modules.Searches;
@@ -12,10 +13,12 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
private readonly YtdlYoutubeSearchService _ytdl; private readonly YtdlYoutubeSearchService _ytdl;
private readonly YoutubeDataApiSearchService _ytdata; private readonly YoutubeDataApiSearchService _ytdata;
private readonly InvidiousYtSearchService _iYtSs; private readonly InvidiousYtSearchService _iYtSs;
private readonly GoogleScrapeService _gscs;
public DefaultSearchServiceFactory( public DefaultSearchServiceFactory(
SearchesConfigService scs, SearchesConfigService scs,
GoogleSearchService gss, GoogleSearchService gss,
GoogleScrapeService gscs,
SearxSearchService sss, SearxSearchService sss,
YtdlpYoutubeSearchService ytdlp, YtdlpYoutubeSearchService ytdlp,
YtdlYoutubeSearchService ytdl, YtdlYoutubeSearchService ytdl,
@@ -25,6 +28,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
_scs = scs; _scs = scs;
_sss = sss; _sss = sss;
_gss = gss; _gss = gss;
_gscs = gscs;
_iYtSs = iYtSs; _iYtSs = iYtSs;
_ytdlp = ytdlp; _ytdlp = ytdlp;
@@ -36,6 +40,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
=> _scs.Data.WebSearchEngine switch => _scs.Data.WebSearchEngine switch
{ {
WebSearchEngine.Google => _gss, WebSearchEngine.Google => _gss,
WebSearchEngine.Google_Scrape => _gscs,
WebSearchEngine.Searx => _sss, WebSearchEngine.Searx => _sss,
_ => _gss _ => _gss
}; };

View File

@@ -39,7 +39,7 @@ public sealed class GoogleSearchService : SearchServiceBase, INService
return result; return result;
} }
public override async ITask<GoogleCustomSearchResult?> SearchAsync(string query) public override async ITask<GoogleCustomSearchResult?> SearchAsync(string? query)
{ {
ArgumentNullException.ThrowIfNull(query); ArgumentNullException.ThrowIfNull(query);

View File

@@ -1,62 +1,121 @@
// using AngleSharp.Html.Dom; using AngleSharp.Html.Dom;
// using MorseCode.ITask; using AngleSharp.Html.Parser;
// using NadekoBot.Modules.Searches.Common; using MorseCode.ITask;
//
// namespace NadekoBot.Modules.Searches.GoogleScrape; namespace NadekoBot.Modules.Searches.GoogleScrape;
//
/// <summary>
/// Web search service that scrapes the Google results page instead of calling an official API.
/// The markup it depends on is not a stable contract, so parsing may stop working without notice.
/// </summary>
public sealed class GoogleScrapeService : SearchServiceBase, INService
{
    // Sent with every request so the response resembles what a desktop browser receives.
    private const string UserAgent =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36";

    // Cookie value sent with every request.
    private const string ConsentCookie = "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;";

    // Shared parser; scripting, embedding, processing instructions, source references and frames are all off.
    private static readonly HtmlParser _googleParser = new(new()
    {
        IsScripting = false,
        IsEmbedded = false,
        IsSupportingProcessingInstructions = false,
        IsKeepingSourceReferences = false,
        IsNotSupportingFrames = true
    });

    private readonly IHttpClientFactory _httpFactory;

    /// <summary>Creates the service.</summary>
    /// <param name="httpClientFactory">Factory used to create the HTTP client for each search.</param>
    public GoogleScrapeService(IHttpClientFactory httpClientFactory)
        => _httpFactory = httpClientFactory;

    /// <summary>
    /// Requests the Google results page for <paramref name="query"/> and extracts the result entries from it.
    /// </summary>
    /// <param name="query">Search terms.</param>
    /// <returns>
    /// The parsed results, or <c>null</c> when the page contains no result elements.
    /// Elements lacking a url, title or description are left out, so the entry list may be empty.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
    public override async ITask<ISearchResult?> SearchAsync(string? query)
    {
        ArgumentNullException.ThrowIfNull(query);

        // EscapeDataString never returns null and already encodes spaces (as %20),
        // so no further replacement is needed.
        var escapedQuery = Uri.EscapeDataString(query);
        var fullQueryLink = $"https://www.google.ca/search?q={escapedQuery}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";

        using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
        msg.Headers.Add("User-Agent", UserAgent);
        msg.Headers.Add("Cookie", ConsentCookie);

        using var http = _httpFactory.CreateClient();
        http.DefaultRequestHeaders.Clear();

        using var response = await http.SendAsync(msg);
        await using var content = await response.Content.ReadAsStreamAsync();

        using var document = await _googleParser.ParseDocumentAsync(content);
        var elems = document.QuerySelectorAll("div.g, div.mnr-c > div > div");
        if (!elems.Any())
            return default;

        // The stats text is split on the word "results": the part before it is the total,
        // the part after it holds the elapsed time. The original author noted that reading
        // the time from the element's children does not work because the <nobr> child is
        // missing from the parsed document.
        var stats = document.QuerySelector("#result-stats")?.TextContent.Split("results");
        var totalResults = stats is { Length: > 0 }
            ? stats[0]
            : null;
        var time = stats is { Length: >= 2 }
            ? stats[1]
              .Replace("(", string.Empty)
              .Replace("seconds)", string.Empty)
            : null;

        var entries = elems.Select(elem =>
                           {
                               // Pattern match instead of a hard cast: a first "a" element that is
                               // not an HTML anchor is skipped rather than throwing.
                               if (elem.QuerySelector("a") is not IHtmlAnchorElement anchor)
                                   return null;

                               var url = anchor.Href;
                               var title = anchor.QuerySelector("h3")?.TextContent;

                               // Preferred source is the first span in the element following the
                               // anchor's parent; the fallback is the fourth span of the result element.
                               var description = anchor.ParentElement
                                                       ?.NextElementSibling
                                                       ?.QuerySelector("span")
                                                       ?.TextContent
                                                       .StripHtml()
                                                 ?? elem.QuerySelectorAll("span")
                                                        .Skip(3)
                                                        .FirstOrDefault()
                                                        ?.TextContent
                                                        .StripHtml();

                               if (string.IsNullOrWhiteSpace(url)
                                   || string.IsNullOrWhiteSpace(title)
                                   || string.IsNullOrWhiteSpace(description))
                                   return null;

                               return new PlainSearchResultEntry
                               {
                                   Title = title,
                                   Url = url,
                                   DisplayUrl = url,
                                   Description = description
                               };
                           })
                           // OfType drops the nulls and yields a non-nullable element type.
                           .OfType<PlainSearchResultEntry>()
                           .ToList();

        return new PlainGoogleScrapeSearchResult()
        {
            Answer = null,
            Entries = entries,
            Info = new PlainSearchResultInfo()
            {
                SearchTime = time ?? "?",
                TotalResults = totalResults ?? "?"
            }
        };
    }

    /// <summary>Image search is not implemented for the scraper; contributions are welcome.</summary>
    /// <exception cref="NotSupportedException">Always thrown.</exception>
    public override ITask<IImageSearchResult?> SearchImagesAsync(string query)
        => throw new NotSupportedException();
}

View File

@@ -0,0 +1,8 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;
/// <summary>
/// Plain data holder implementing <see cref="ISearchResult"/> for results obtained by scraping.
/// </summary>
public class PlainGoogleScrapeSearchResult : ISearchResult
{
    /// <summary>Optional answer text; null when there is none.</summary>
    // Declared nullable, so no null-forgiving initializer is needed: the default is already null.
    public string? Answer { get; init; }

    /// <summary>Result entries. Expected to be set through the object initializer.</summary>
    public IReadOnlyCollection<ISearchResultEntry> Entries { get; init; } = null!;

    /// <summary>Information about the search. Expected to be set through the object initializer.</summary>
    public ISearchResultInformation Info { get; init; } = null!;
}

View File

@@ -0,0 +1,9 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;
/// <summary>
/// Plain data holder implementing <see cref="ISearchResultEntry"/> for a single search result.
/// </summary>
public sealed class PlainSearchResultEntry : ISearchResultEntry
{
    /// <summary>Title of the result. Expected to be set through the object initializer.</summary>
    public string Title { get; init; } = null!;

    /// <summary>Address of the result. Expected to be set through the object initializer.</summary>
    public string Url { get; init; } = null!;

    /// <summary>Address as it should be displayed. Expected to be set through the object initializer.</summary>
    public string DisplayUrl { get; init; } = null!;

    /// <summary>Optional description text; null when there is none.</summary>
    // Declared nullable, so no null-forgiving initializer is needed: the default is already null.
    public string? Description { get; init; }
}

View File

@@ -0,0 +1,7 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;
/// <summary>
/// Plain data holder implementing <see cref="ISearchResultInformation"/>.
/// Both values are free-form text and are expected to be set through the object initializer.
/// </summary>
public sealed class PlainSearchResultInfo : ISearchResultInformation
{
    /// <summary>Total number of results, as text.</summary>
    public string TotalResults { get; init; } = default!;

    /// <summary>Time the search took, as text.</summary>
    public string SearchTime { get; init; } = default!;
}

View File

@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
public interface ISearchService public interface ISearchService
{ {
ITask<ISearchResult?> SearchAsync(string query); ITask<ISearchResult?> SearchAsync(string? query);
ITask<IImageSearchResult?> SearchImagesAsync(string query); ITask<IImageSearchResult?> SearchImagesAsync(string query);
} }

View File

@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
public abstract class SearchServiceBase : ISearchService public abstract class SearchServiceBase : ISearchService
{ {
public abstract ITask<ISearchResult?> SearchAsync(string query); public abstract ITask<ISearchResult?> SearchAsync(string? query);
public abstract ITask<IImageSearchResult?> SearchImagesAsync(string query); public abstract ITask<IImageSearchResult?> SearchImagesAsync(string query);
} }

View File

@@ -25,7 +25,7 @@ public sealed class SearxSearchService : SearchServiceBase, INService
return instances[_rng.Next(0, instances.Count)]; return instances[_rng.Next(0, instances.Count)];
} }
public override async ITask<SearxSearchResult> SearchAsync(string query) public override async ITask<SearxSearchResult> SearchAsync(string? query)
{ {
ArgumentNullException.ThrowIfNull(query); ArgumentNullException.ThrowIfNull(query);

View File

@@ -6,13 +6,17 @@ namespace NadekoBot.Modules.Searches;
[Cloneable] [Cloneable]
public partial class SearchesConfig : ICloneable<SearchesConfig> public partial class SearchesConfig : ICloneable<SearchesConfig>
{ {
[Comment("DO NOT CHANGE")]
public int Version { get; set; } = 0;
[Comment(@"Which engine should .search command [Comment(@"Which engine should .search command
'google' requires googleApiKey and google.searchId set in creds.yml 'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
'searx' requires at least one searx instance specified in the 'searxInstances' property below")] 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google; 'searx' - requires at least one searx instance specified in the 'searxInstances' property below")]
public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google_Scrape;
[Comment(@"Which engine should .image command use [Comment(@"Which engine should .image command use
'google' requires googleApiKey and google.imageSearchId set in creds.yml 'google'- official google api. googleApiKey and google.imageSearchId set in creds.yml
'searx' requires at least one searx instance specified in the 'searxInstances' property below")] 'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google; public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google;

View File

@@ -27,5 +27,19 @@ public class SearchesConfigService : ConfigServiceBase<SearchesConfig>
sc => sc.YtProvider, sc => sc.YtProvider,
ConfigParsers.InsensitiveEnum, ConfigParsers.InsensitiveEnum,
ConfigPrinters.ToString); ConfigPrinters.ToString);
Migrate();
}
// Upgrades a config whose Version is below 1: through ModifyConfig it sets
// Version to 1 and switches WebSearchEngine to Google_Scrape.
// Does nothing when Version is already 1 or higher.
// NOTE(review): the engine is overwritten unconditionally during this upgrade,
// replacing whatever value was configured before - confirm that is intended.
private void Migrate()
{
if (data.Version < 1)
{
ModifyConfig(c =>
{
c.Version = 1;
c.WebSearchEngine = WebSearchEngine.Google_Scrape;
});
}
} }
} }

View File

@@ -1,7 +1,9 @@
namespace NadekoBot.Modules.Searches; // ReSharper disable InconsistentNaming
namespace NadekoBot.Modules.Searches;
public enum WebSearchEngine public enum WebSearchEngine
{ {
Google, Google,
Google_Scrape,
Searx, Searx,
} }

View File

@@ -1,9 +1,12 @@
# DO NOT CHANGE
version: 1
# Which engine should .search command # Which engine should .search command
# 'google' requires googleApiKey and google.searchId set in creds.yml # 'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below # 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
webSearchEngine: Google # 'searx' - requires at least one searx instance specified in the 'searxInstances' property below
webSearchEngine: Google_Scrape
# Which engine should .image command use # Which engine should .image command use
# 'google' requires googleApiKey and google.imageSearchId set in creds.yml # 'google'- official google api. googleApiKey and google.imageSearchId set in creds.yml
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below # 'searx' requires at least one searx instance specified in the 'searxInstances' property below
imgSearchEngine: Google imgSearchEngine: Google
# Which search provider will be used for the `.youtube` command. # Which search provider will be used for the `.youtube` command.