From 64b2a46c95f2882e2606a706f35f7a5cffef5bef Mon Sep 17 00:00:00 2001 From: Kwoth Date: Tue, 14 Jun 2022 13:18:35 +0200 Subject: [PATCH] Re-added .google search result scraper and set it as the default again. Also added versioning to searches.yml as it was forgotten previously --- .../Search/DefaultSearchServiceFactory.cs | 7 +- .../Search/Google/GoogleSearchService.cs | 2 +- .../GoogleScrape/GoogleScrapeService.cs | 183 ++++++++++++------ .../PlainGoogleScrapeSearchResult.cs | 8 + .../GoogleScrape/PlainSearchResultEntry.cs | 9 + .../GoogleScrape/PlainSearchResultInfo.cs | 7 + .../Modules/Searches/Search/ISearchService.cs | 2 +- .../Searches/Search/SearchServiceBase.cs | 2 +- .../Search/Searx/SearxSearchService.cs | 2 +- .../Searches/_Common/Config/SearchesConfig.cs | 12 +- .../_Common/Config/SearchesConfigService.cs | 14 ++ .../_Common/Config/WebSearchEngine.cs | 4 +- src/NadekoBot/data/searches.yml | 11 +- 13 files changed, 187 insertions(+), 76 deletions(-) create mode 100644 src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainGoogleScrapeSearchResult.cs create mode 100644 src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultEntry.cs create mode 100644 src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultInfo.cs diff --git a/src/NadekoBot/Modules/Searches/Search/DefaultSearchServiceFactory.cs b/src/NadekoBot/Modules/Searches/Search/DefaultSearchServiceFactory.cs index 9060a62e0..6f31dcc8e 100644 --- a/src/NadekoBot/Modules/Searches/Search/DefaultSearchServiceFactory.cs +++ b/src/NadekoBot/Modules/Searches/Search/DefaultSearchServiceFactory.cs @@ -1,4 +1,5 @@ -using NadekoBot.Modules.Searches.Youtube; +using NadekoBot.Modules.Searches.GoogleScrape; +using NadekoBot.Modules.Searches.Youtube; namespace NadekoBot.Modules.Searches; @@ -12,10 +13,12 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi private readonly YtdlYoutubeSearchService _ytdl; private readonly YoutubeDataApiSearchService _ytdata; 
private readonly InvidiousYtSearchService _iYtSs; + private readonly GoogleScrapeService _gscs; public DefaultSearchServiceFactory( SearchesConfigService scs, GoogleSearchService gss, + GoogleScrapeService gscs, SearxSearchService sss, YtdlpYoutubeSearchService ytdlp, YtdlYoutubeSearchService ytdl, @@ -25,6 +28,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi _scs = scs; _sss = sss; _gss = gss; + _gscs = gscs; _iYtSs = iYtSs; _ytdlp = ytdlp; @@ -36,6 +40,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi => _scs.Data.WebSearchEngine switch { WebSearchEngine.Google => _gss, + WebSearchEngine.Google_Scrape => _gscs, WebSearchEngine.Searx => _sss, _ => _gss }; diff --git a/src/NadekoBot/Modules/Searches/Search/Google/GoogleSearchService.cs b/src/NadekoBot/Modules/Searches/Search/Google/GoogleSearchService.cs index 5639ffa1c..a049e10cb 100644 --- a/src/NadekoBot/Modules/Searches/Search/Google/GoogleSearchService.cs +++ b/src/NadekoBot/Modules/Searches/Search/Google/GoogleSearchService.cs @@ -39,7 +39,7 @@ public sealed class GoogleSearchService : SearchServiceBase, INService return result; } - public override async ITask SearchAsync(string query) + public override async ITask SearchAsync(string? 
query) { ArgumentNullException.ThrowIfNull(query); diff --git a/src/NadekoBot/Modules/Searches/Search/GoogleScrape/GoogleScrapeService.cs b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/GoogleScrapeService.cs index d9aa576a7..e3da48419 100644 --- a/src/NadekoBot/Modules/Searches/Search/GoogleScrape/GoogleScrapeService.cs +++ b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/GoogleScrapeService.cs @@ -1,62 +1,121 @@ -// using AngleSharp.Html.Dom; -// using MorseCode.ITask; -// using NadekoBot.Modules.Searches.Common; -// -// namespace NadekoBot.Modules.Searches.GoogleScrape; -// -// public sealed class GoogleScrapeService : SearchServiceBase -// { -// public override async ITask SearchAsync(string query) -// { -// ArgumentNullException.ThrowIfNull(query); -// -// query = Uri.EscapeDataString(query)?.Replace(' ', '+'); -// -// var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8"; -// -// using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink); -// msg.Headers.Add("User-Agent", -// "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"); -// msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;"); -// -// using var http = _httpFactory.CreateClient(); -// http.DefaultRequestHeaders.Clear(); -// -// using var response = await http.SendAsync(msg); -// await using var content = await response.Content.ReadAsStreamAsync(); -// -// using var document = await _googleParser.ParseDocumentAsync(content); -// var elems = document.QuerySelectorAll("div.g > div > div"); -// -// var resultsElem = document.QuerySelectorAll("#resultStats").FirstOrDefault(); -// var totalResults = resultsElem?.TextContent; -// //var time = resultsElem.Children.FirstOrDefault()?.TextContent -// //^ this doesn't work for some reason, is completely missing in parsed collection -// if (!elems.Any()) -// return default; -// -// var results = 
elems.Select(elem => -// { -// var children = elem.Children.ToList(); -// if (children.Count < 2) -// return null; -// -// var href = (children[0].QuerySelector("a") as IHtmlAnchorElement)?.Href; -// var name = children[0].QuerySelector("h3")?.TextContent; -// -// if (href is null || name is null) -// return null; -// -// var txt = children[1].TextContent; -// -// if (string.IsNullOrWhiteSpace(txt)) -// return null; -// -// return new GoogleSearchResult(name, href, txt); -// }) -// .Where(x => x is not null) -// .ToList(); -// -// return new(results.AsReadOnly(), fullQueryLink, totalResults); -// } -// } \ No newline at end of file +using AngleSharp.Html.Dom; +using AngleSharp.Html.Parser; +using MorseCode.ITask; + +namespace NadekoBot.Modules.Searches.GoogleScrape; + +public sealed class GoogleScrapeService : SearchServiceBase, INService +{ + private static readonly HtmlParser _googleParser = new(new() + { + IsScripting = false, + IsEmbedded = false, + IsSupportingProcessingInstructions = false, + IsKeepingSourceReferences = false, + IsNotSupportingFrames = true + }); + + + private readonly IHttpClientFactory _httpFactory; + + public GoogleScrapeService(IHttpClientFactory httpClientFactory) + => _httpFactory = httpClientFactory; + + public override async ITask SearchAsync(string? 
query) + { + ArgumentNullException.ThrowIfNull(query); + + query = Uri.EscapeDataString(query)?.Replace(' ', '+'); + + var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8"; + + using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink); + msg.Headers.Add("User-Agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"); + msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;"); + + using var http = _httpFactory.CreateClient(); + http.DefaultRequestHeaders.Clear(); + + using var response = await http.SendAsync(msg); + await using var content = await response.Content.ReadAsStreamAsync(); + + using var document = await _googleParser.ParseDocumentAsync(content); + var elems = document.QuerySelectorAll("div.g, div.mnr-c > div > div"); + + var resultsElem = document.QuerySelector("#result-stats"); + var resultsArr = resultsElem?.TextContent.Split("results"); + var totalResults = resultsArr?.Length is null or 0 + ? null + : resultsArr[0]; + + var time = resultsArr is null or {Length: < 2} + ? null + : resultsArr[1] + .Replace("(", string.Empty) + .Replace("seconds)", string.Empty); + + //var time = resultsElem.Children.FirstOrDefault()?.TextContent + //^ this doesn't work for some reason, is completely missing in parsed collection + if (!elems.Any()) + return default; + + var results = elems.Select(elem => + { + var aTag = elem.QuerySelector("a"); + + if (aTag is null) + return null; + + var url = ((IHtmlAnchorElement)aTag).Href; + var title = aTag.QuerySelector("h3")?.TextContent; + + var txt = aTag.ParentElement + ?.NextElementSibling + ?.QuerySelector("span") + ?.TextContent + .StripHtml() + ?? 
elem + ?.QuerySelectorAll("span") + .Skip(3) + .FirstOrDefault() + ?.TextContent + .StripHtml(); + // .Select(x => x.TextContent.StripHtml()) + // .Join("\n"); + + if (string.IsNullOrWhiteSpace(url) + || string.IsNullOrWhiteSpace(title) + || string.IsNullOrWhiteSpace(txt)) + return null; + + return new PlainSearchResultEntry + { + Title = title, + Url = url, + DisplayUrl = url, + Description = txt + }; + }) + .Where(x => x is not null) + .ToList(); + + // return new GoogleSearchResult(results.AsReadOnly(), fullQueryLink, totalResults); + + return new PlainGoogleScrapeSearchResult() + { + Answer = null, + Entries = results!, + Info = new PlainSearchResultInfo() + { + SearchTime = time ?? "?", + TotalResults = totalResults ?? "?" + } + }; + } + + + // someone can mr this + public override ITask SearchImagesAsync(string query) + => throw new NotSupportedException(); +} \ No newline at end of file diff --git a/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainGoogleScrapeSearchResult.cs b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainGoogleScrapeSearchResult.cs new file mode 100644 index 000000000..d3aeb7357 --- /dev/null +++ b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainGoogleScrapeSearchResult.cs @@ -0,0 +1,8 @@ +namespace NadekoBot.Modules.Searches.GoogleScrape; + +public class PlainGoogleScrapeSearchResult : ISearchResult +{ + public string? 
Answer { get; init; } = null!; + public IReadOnlyCollection Entries { get; init; } = null!; + public ISearchResultInformation Info { get; init; } = null!; +} \ No newline at end of file diff --git a/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultEntry.cs b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultEntry.cs new file mode 100644 index 000000000..872d9ba8c --- /dev/null +++ b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultEntry.cs @@ -0,0 +1,9 @@ +namespace NadekoBot.Modules.Searches.GoogleScrape; + +public sealed class PlainSearchResultEntry : ISearchResultEntry +{ + public string Title { get; init; } = null!; + public string Url { get; init; } = null!; + public string DisplayUrl { get; init; } = null!; + public string? Description { get; init; } = null!; +} \ No newline at end of file diff --git a/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultInfo.cs b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultInfo.cs new file mode 100644 index 000000000..bdc50dc25 --- /dev/null +++ b/src/NadekoBot/Modules/Searches/Search/GoogleScrape/PlainSearchResultInfo.cs @@ -0,0 +1,7 @@ +namespace NadekoBot.Modules.Searches.GoogleScrape; + +public sealed class PlainSearchResultInfo : ISearchResultInformation +{ + public string TotalResults { get; init; } = null!; + public string SearchTime { get; init; } = null!; +} \ No newline at end of file diff --git a/src/NadekoBot/Modules/Searches/Search/ISearchService.cs b/src/NadekoBot/Modules/Searches/Search/ISearchService.cs index ac903af90..c5b9ae57d 100644 --- a/src/NadekoBot/Modules/Searches/Search/ISearchService.cs +++ b/src/NadekoBot/Modules/Searches/Search/ISearchService.cs @@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches; public interface ISearchService { - ITask SearchAsync(string query); + ITask SearchAsync(string? 
query); ITask SearchImagesAsync(string query); } \ No newline at end of file diff --git a/src/NadekoBot/Modules/Searches/Search/SearchServiceBase.cs b/src/NadekoBot/Modules/Searches/Search/SearchServiceBase.cs index 385dfc5c4..15fe578ad 100644 --- a/src/NadekoBot/Modules/Searches/Search/SearchServiceBase.cs +++ b/src/NadekoBot/Modules/Searches/Search/SearchServiceBase.cs @@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches; public abstract class SearchServiceBase : ISearchService { - public abstract ITask SearchAsync(string query); + public abstract ITask SearchAsync(string? query); public abstract ITask SearchImagesAsync(string query); } \ No newline at end of file diff --git a/src/NadekoBot/Modules/Searches/Search/Searx/SearxSearchService.cs b/src/NadekoBot/Modules/Searches/Search/Searx/SearxSearchService.cs index 374c601eb..6d0e059bd 100644 --- a/src/NadekoBot/Modules/Searches/Search/Searx/SearxSearchService.cs +++ b/src/NadekoBot/Modules/Searches/Search/Searx/SearxSearchService.cs @@ -25,7 +25,7 @@ public sealed class SearxSearchService : SearchServiceBase, INService return instances[_rng.Next(0, instances.Count)]; } - public override async ITask SearchAsync(string query) + public override async ITask SearchAsync(string? 
query) { ArgumentNullException.ThrowIfNull(query); diff --git a/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfig.cs b/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfig.cs index 3da7b0743..e05010a0a 100644 --- a/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfig.cs +++ b/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfig.cs @@ -6,13 +6,17 @@ namespace NadekoBot.Modules.Searches; [Cloneable] public partial class SearchesConfig : ICloneable { + [Comment("DO NOT CHANGE")] + public int Version { get; set; } = 0; + [Comment(@"Which engine should .search command -'google' requires googleApiKey and google.searchId set in creds.yml -'searx' requires at least one searx instance specified in the 'searxInstances' property below")] - public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google; +'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys. +'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml +'searx' - requires at least one searx instance specified in the 'searxInstances' property below")] + public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google_Scrape; [Comment(@"Which engine should .image command use -'google' requires googleApiKey and google.imageSearchId set in creds.yml +'google'- official google api. 
googleApiKey and google.imageSearchId set in creds.yml 'searx' requires at least one searx instance specified in the 'searxInstances' property below")] public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google; diff --git a/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfigService.cs b/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfigService.cs index 3222da3f7..4d58098b4 100644 --- a/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfigService.cs +++ b/src/NadekoBot/Modules/Searches/_Common/Config/SearchesConfigService.cs @@ -27,5 +27,19 @@ public class SearchesConfigService : ConfigServiceBase sc => sc.YtProvider, ConfigParsers.InsensitiveEnum, ConfigPrinters.ToString); + + Migrate(); + } + + private void Migrate() + { + if (data.Version < 1) + { + ModifyConfig(c => + { + c.Version = 1; + c.WebSearchEngine = WebSearchEngine.Google_Scrape; + }); + } } } \ No newline at end of file diff --git a/src/NadekoBot/Modules/Searches/_Common/Config/WebSearchEngine.cs b/src/NadekoBot/Modules/Searches/_Common/Config/WebSearchEngine.cs index 097210e0f..7f5fb36cf 100644 --- a/src/NadekoBot/Modules/Searches/_Common/Config/WebSearchEngine.cs +++ b/src/NadekoBot/Modules/Searches/_Common/Config/WebSearchEngine.cs @@ -1,7 +1,9 @@ -namespace NadekoBot.Modules.Searches; +// ReSharper disable InconsistentNaming +namespace NadekoBot.Modules.Searches; public enum WebSearchEngine { Google, + Google_Scrape, Searx, } \ No newline at end of file diff --git a/src/NadekoBot/data/searches.yml b/src/NadekoBot/data/searches.yml index 343f26b53..4422f2318 100644 --- a/src/NadekoBot/data/searches.yml +++ b/src/NadekoBot/data/searches.yml @@ -1,9 +1,12 @@ +# DO NOT CHANGE +version: 1 # Which engine should .search command -# 'google' requires googleApiKey and google.searchId set in creds.yml -# 'searx' requires at least one searx instance specified in the 'searxInstances' property below -webSearchEngine: Google +# 'google_scrape' - default. 
Scrapes the webpage for results. May break. Requires no api keys. +# 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml +# 'searx' - requires at least one searx instance specified in the 'searxInstances' property below +webSearchEngine: Google_Scrape # Which engine should .image command use -# 'google' requires googleApiKey and google.imageSearchId set in creds.yml +# 'google' - official google api. Requires googleApiKey and google.imageSearchId set in creds.yml # 'searx' requires at least one searx instance specified in the 'searxInstances' property below imgSearchEngine: Google # Which search provider will be used for the `.youtube` command.