mirror of
				https://gitlab.com/Kwoth/nadekobot.git
				synced 2025-11-04 00:34:26 -05:00 
			
		
		
		
	Re-added .google search result scraper and set it as the default again. Also added versioning to searches.yml as it was forgotten previously
This commit is contained in:
		@@ -1,4 +1,5 @@
 | 
			
		||||
using NadekoBot.Modules.Searches.Youtube;
 | 
			
		||||
using NadekoBot.Modules.Searches.GoogleScrape;
 | 
			
		||||
using NadekoBot.Modules.Searches.Youtube;
 | 
			
		||||
 | 
			
		||||
namespace NadekoBot.Modules.Searches;
 | 
			
		||||
 | 
			
		||||
@@ -12,10 +13,12 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
 | 
			
		||||
    private readonly YtdlYoutubeSearchService _ytdl;
 | 
			
		||||
    private readonly YoutubeDataApiSearchService _ytdata;
 | 
			
		||||
    private readonly InvidiousYtSearchService _iYtSs;
 | 
			
		||||
    private readonly GoogleScrapeService _gscs;
 | 
			
		||||
 | 
			
		||||
    public DefaultSearchServiceFactory(
 | 
			
		||||
        SearchesConfigService scs,
 | 
			
		||||
        GoogleSearchService gss,
 | 
			
		||||
        GoogleScrapeService gscs,
 | 
			
		||||
        SearxSearchService sss,
 | 
			
		||||
        YtdlpYoutubeSearchService ytdlp,
 | 
			
		||||
        YtdlYoutubeSearchService ytdl,
 | 
			
		||||
@@ -25,6 +28,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
 | 
			
		||||
        _scs = scs;
 | 
			
		||||
        _sss = sss;
 | 
			
		||||
        _gss = gss;
 | 
			
		||||
        _gscs = gscs;
 | 
			
		||||
        _iYtSs = iYtSs;
 | 
			
		||||
 | 
			
		||||
        _ytdlp = ytdlp;
 | 
			
		||||
@@ -36,6 +40,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
 | 
			
		||||
        => _scs.Data.WebSearchEngine switch
 | 
			
		||||
        {
 | 
			
		||||
            WebSearchEngine.Google => _gss,
 | 
			
		||||
            WebSearchEngine.Google_Scrape => _gscs,
 | 
			
		||||
            WebSearchEngine.Searx => _sss,
 | 
			
		||||
            _ => _gss
 | 
			
		||||
        };
 | 
			
		||||
 
 | 
			
		||||
@@ -39,7 +39,7 @@ public sealed class GoogleSearchService : SearchServiceBase, INService
 | 
			
		||||
        return result;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    public override async ITask<GoogleCustomSearchResult?> SearchAsync(string query)
 | 
			
		||||
    public override async ITask<GoogleCustomSearchResult?> SearchAsync(string? query)
 | 
			
		||||
    {
 | 
			
		||||
        ArgumentNullException.ThrowIfNull(query);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,62 +1,121 @@
 | 
			
		||||
// using AngleSharp.Html.Dom;
 | 
			
		||||
// using MorseCode.ITask;
 | 
			
		||||
// using NadekoBot.Modules.Searches.Common;
 | 
			
		||||
//
 | 
			
		||||
// namespace NadekoBot.Modules.Searches.GoogleScrape;
 | 
			
		||||
//
 | 
			
		||||
// public sealed class GoogleScrapeService : SearchServiceBase
 | 
			
		||||
// {
 | 
			
		||||
//     public override async ITask<GoogleSearchResultData> SearchAsync(string query)
 | 
			
		||||
//     {
 | 
			
		||||
//         ArgumentNullException.ThrowIfNull(query);
 | 
			
		||||
//         
 | 
			
		||||
//         query = Uri.EscapeDataString(query)?.Replace(' ', '+');
 | 
			
		||||
//
 | 
			
		||||
//         var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";
 | 
			
		||||
//
 | 
			
		||||
//         using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
 | 
			
		||||
//         msg.Headers.Add("User-Agent",
 | 
			
		||||
//             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
 | 
			
		||||
//         msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");
 | 
			
		||||
//
 | 
			
		||||
//         using var http = _httpFactory.CreateClient();
 | 
			
		||||
//         http.DefaultRequestHeaders.Clear();
 | 
			
		||||
//
 | 
			
		||||
//         using var response = await http.SendAsync(msg);
 | 
			
		||||
//         await using var content = await response.Content.ReadAsStreamAsync();
 | 
			
		||||
//
 | 
			
		||||
//         using var document = await _googleParser.ParseDocumentAsync(content);
 | 
			
		||||
//         var elems = document.QuerySelectorAll("div.g > div > div");
 | 
			
		||||
//
 | 
			
		||||
//         var resultsElem = document.QuerySelectorAll("#resultStats").FirstOrDefault();
 | 
			
		||||
//         var totalResults = resultsElem?.TextContent;
 | 
			
		||||
//         //var time = resultsElem.Children.FirstOrDefault()?.TextContent
 | 
			
		||||
//         //^ this doesn't work for some reason, <nobr> is completely missing in parsed collection
 | 
			
		||||
//         if (!elems.Any())
 | 
			
		||||
//             return default;
 | 
			
		||||
//
 | 
			
		||||
//         var results = elems.Select(elem =>
 | 
			
		||||
//                            {
 | 
			
		||||
//                                var children = elem.Children.ToList();
 | 
			
		||||
//                                if (children.Count < 2)
 | 
			
		||||
//                                    return null;
 | 
			
		||||
//
 | 
			
		||||
//                                var href = (children[0].QuerySelector("a") as IHtmlAnchorElement)?.Href;
 | 
			
		||||
//                                var name = children[0].QuerySelector("h3")?.TextContent;
 | 
			
		||||
//
 | 
			
		||||
//                                if (href is null || name is null)
 | 
			
		||||
//                                    return null;
 | 
			
		||||
//
 | 
			
		||||
//                                var txt = children[1].TextContent;
 | 
			
		||||
//
 | 
			
		||||
//                                if (string.IsNullOrWhiteSpace(txt))
 | 
			
		||||
//                                    return null;
 | 
			
		||||
//
 | 
			
		||||
//                                return new GoogleSearchResult(name, href, txt);
 | 
			
		||||
//                            })
 | 
			
		||||
//                            .Where(x => x is not null)
 | 
			
		||||
//                            .ToList();
 | 
			
		||||
//
 | 
			
		||||
//         return new(results.AsReadOnly(), fullQueryLink, totalResults);
 | 
			
		||||
//     }
 | 
			
		||||
// }
 | 
			
		||||
using AngleSharp.Html.Dom;
 | 
			
		||||
using AngleSharp.Html.Parser;
 | 
			
		||||
using MorseCode.ITask;
 | 
			
		||||
 | 
			
		||||
namespace NadekoBot.Modules.Searches.GoogleScrape;
 | 
			
		||||
 | 
			
		||||
/// <summary>
/// Web search service which downloads a Google results page and extracts the result
/// entries from its HTML. It needs no API keys, but it depends on the CSS selectors
/// used below matching the markup of the page it receives.
/// </summary>
public sealed class GoogleScrapeService : SearchServiceBase, INService
{
    // Shared parser instance with scripting, embedded content, processing instructions,
    // source references and frames all switched off.
    private static readonly HtmlParser _googleParser = new(new()
    {
        IsScripting = false,
        IsEmbedded = false,
        IsSupportingProcessingInstructions = false,
        IsKeepingSourceReferences = false,
        IsNotSupportingFrames = true
    });

    private readonly IHttpClientFactory _httpFactory;

    /// <summary>Creates the service.</summary>
    /// <param name="httpClientFactory">Factory used to create the client which performs the request.</param>
    public GoogleScrapeService(IHttpClientFactory httpClientFactory)
        => _httpFactory = httpClientFactory;

    /// <summary>
    /// Requests the results page for <paramref name="query"/> and converts it into a search result.
    /// </summary>
    /// <param name="query">Text to search for.</param>
    /// <returns>
    /// The parsed result, or <c>null</c> when no result container elements were found on the page.
    /// Entries lacking a link, a title or a description are left out.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is <c>null</c>.</exception>
    public override async ITask<ISearchResult?> SearchAsync(string? query)
    {
        ArgumentNullException.ThrowIfNull(query);

        // EscapeDataString already encodes spaces (as %20), so no further replacing is needed.
        query = Uri.EscapeDataString(query);

        var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";

        using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
        msg.Headers.Add("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
        // NOTE(review): presumably sent so that the consent page is not served instead of results - confirm.
        msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");

        using var http = _httpFactory.CreateClient();
        http.DefaultRequestHeaders.Clear();

        using var response = await http.SendAsync(msg);
        await using var content = await response.Content.ReadAsStreamAsync();

        using var document = await _googleParser.ParseDocumentAsync(content);
        var elems = document.QuerySelectorAll("div.g, div.mnr-c > div > div");

        if (!elems.Any())
            return default;

        var (totalResults, time) = ParseResultStats(document.QuerySelector("#result-stats")?.TextContent);

        var results = elems.Select(elem =>
                           {
                               // Anything which is not an html anchor is skipped instead of
                               // failing the whole search with an InvalidCastException.
                               if (elem.QuerySelector("a") is not IHtmlAnchorElement anchor)
                                   return null;

                               var url = anchor.Href;
                               var title = anchor.QuerySelector("h3")?.TextContent;

                               // Description: first span of the element following the anchor's parent,
                               // otherwise the fourth span found anywhere inside the result element.
                               var txt = anchor.ParentElement
                                               ?.NextElementSibling
                                               ?.QuerySelector("span")
                                               ?.TextContent
                                               .StripHtml()
                                         ?? elem.QuerySelectorAll("span")
                                                .Skip(3)
                                                .FirstOrDefault()
                                                ?.TextContent
                                                .StripHtml();

                               if (string.IsNullOrWhiteSpace(url)
                                   || string.IsNullOrWhiteSpace(title)
                                   || string.IsNullOrWhiteSpace(txt))
                                   return null;

                               return new PlainSearchResultEntry
                               {
                                   Title = title,
                                   Url = url,
                                   DisplayUrl = url,
                                   Description = txt
                               };
                           })
                           // OfType drops the nulls and yields a non-nullable element type.
                           .OfType<PlainSearchResultEntry>()
                           .ToList();

        return new PlainGoogleScrapeSearchResult
        {
            Answer = null,
            Entries = results,
            Info = new PlainSearchResultInfo
            {
                SearchTime = time ?? "?",
                TotalResults = totalResults ?? "?"
            }
        };
    }

    /// <summary>
    /// Splits the text of the <c>#result-stats</c> element around the word "results" into
    /// the part before it (result count) and the part after it (search time).
    /// </summary>
    /// <param name="statsText">Text content of the stats element, or <c>null</c> if it was not found.</param>
    /// <returns>Both values trimmed; a value which is missing or blank is returned as <c>null</c>.</returns>
    private static (string? TotalResults, string? SearchTime) ParseResultStats(string? statsText)
    {
        if (statsText is null)
            return (null, null);

        // The time is not read from a child element, as <nobr> was found to be missing from the parsed collection.
        var parts = statsText.Split("results");

        var total = parts[0].Trim();
        var time = parts.Length < 2
            ? null
            : parts[1]
              .Replace("(", string.Empty)
              .Replace("seconds)", string.Empty)
              .Trim();

        return (string.IsNullOrWhiteSpace(total) ? null : total,
            string.IsNullOrWhiteSpace(time) ? null : time);
    }

    /// <summary>Image search is not implemented by this service; contributions are welcome.</summary>
    /// <param name="query">Unused.</param>
    /// <exception cref="NotSupportedException">Always thrown.</exception>
    public override ITask<IImageSearchResult?> SearchImagesAsync(string query)
        => throw new NotSupportedException();
}
 | 
			
		||||
@@ -0,0 +1,8 @@
 | 
			
		||||
namespace NadekoBot.Modules.Searches.GoogleScrape;
 | 
			
		||||
 | 
			
		||||
/// <summary>
/// Plain data holder implementing <see cref="ISearchResult"/> for results produced by scraping.
/// </summary>
public class PlainGoogleScrapeSearchResult : ISearchResult
{
    /// <summary>Optional answer text; <c>null</c> when there is none.</summary>
    // Nullable property, so it simply defaults to null - no null-forgiving initializer needed.
    public string? Answer { get; init; }

    /// <summary>The individual result entries.</summary>
    public IReadOnlyCollection<ISearchResultEntry> Entries { get; init; } = null!;

    /// <summary>Information about the search as a whole.</summary>
    public ISearchResultInformation Info { get; init; } = null!;
}
 | 
			
		||||
@@ -0,0 +1,9 @@
 | 
			
		||||
namespace NadekoBot.Modules.Searches.GoogleScrape;
 | 
			
		||||
 | 
			
		||||
/// <summary>
/// Plain data holder implementing <see cref="ISearchResultEntry"/> for a single search result.
/// </summary>
public sealed class PlainSearchResultEntry : ISearchResultEntry
{
    /// <summary>Title of the result.</summary>
    public string Title { get; init; } = null!;

    /// <summary>Link to the result.</summary>
    public string Url { get; init; } = null!;

    /// <summary>Link text meant for display.</summary>
    public string DisplayUrl { get; init; } = null!;

    /// <summary>Optional description text; <c>null</c> when there is none.</summary>
    // Nullable property, so it simply defaults to null - no null-forgiving initializer needed.
    public string? Description { get; init; }
}
 | 
			
		||||
@@ -0,0 +1,7 @@
 | 
			
		||||
namespace NadekoBot.Modules.Searches.GoogleScrape;
 | 
			
		||||
 | 
			
		||||
/// <summary>
/// Plain data holder implementing <see cref="ISearchResultInformation"/>.
/// Both values are kept as text.
/// </summary>
public sealed class PlainSearchResultInfo : ISearchResultInformation
{
    /// <summary>Text describing how many results the search produced.</summary>
    public string TotalResults { get; init; } = null!;

    /// <summary>Text describing how long the search took.</summary>
    public string SearchTime { get; init; } = null!;
}
 | 
			
		||||
@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
 | 
			
		||||
 | 
			
		||||
public interface ISearchService
 | 
			
		||||
{
 | 
			
		||||
    ITask<ISearchResult?> SearchAsync(string query);
 | 
			
		||||
    ITask<ISearchResult?> SearchAsync(string? query);
 | 
			
		||||
    ITask<IImageSearchResult?> SearchImagesAsync(string query);
 | 
			
		||||
}
 | 
			
		||||
@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
 | 
			
		||||
 | 
			
		||||
public abstract class SearchServiceBase : ISearchService
 | 
			
		||||
{
 | 
			
		||||
    public abstract ITask<ISearchResult?> SearchAsync(string query);
 | 
			
		||||
    public abstract ITask<ISearchResult?> SearchAsync(string? query);
 | 
			
		||||
    public abstract ITask<IImageSearchResult?> SearchImagesAsync(string query);
 | 
			
		||||
}
 | 
			
		||||
@@ -25,7 +25,7 @@ public sealed class SearxSearchService : SearchServiceBase, INService
 | 
			
		||||
        return instances[_rng.Next(0, instances.Count)];
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    public override async ITask<SearxSearchResult> SearchAsync(string query)
 | 
			
		||||
    public override async ITask<SearxSearchResult> SearchAsync(string? query)
 | 
			
		||||
    {
 | 
			
		||||
        ArgumentNullException.ThrowIfNull(query);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -6,13 +6,17 @@ namespace NadekoBot.Modules.Searches;
 | 
			
		||||
[Cloneable]
 | 
			
		||||
public partial class SearchesConfig : ICloneable<SearchesConfig>
 | 
			
		||||
{
 | 
			
		||||
    [Comment("DO NOT CHANGE")]
 | 
			
		||||
    public int Version { get; set; } = 0;
 | 
			
		||||
    
 | 
			
		||||
    [Comment(@"Which engine should .search command
 | 
			
		||||
'google' requires googleApiKey and google.searchId set in creds.yml
 | 
			
		||||
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
 | 
			
		||||
    public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google;
 | 
			
		||||
'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
 | 
			
		||||
'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
 | 
			
		||||
'searx' - requires at least one searx instance specified in the 'searxInstances' property below")]
 | 
			
		||||
    public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google_Scrape;
 | 
			
		||||
    
 | 
			
		||||
    [Comment(@"Which engine should .image command use
 | 
			
		||||
'google' requires googleApiKey and google.imageSearchId set in creds.yml
 | 
			
		||||
'google'- official google api. googleApiKey and google.imageSearchId set in creds.yml
 | 
			
		||||
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
 | 
			
		||||
    public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google;
 | 
			
		||||
    
 | 
			
		||||
 
 | 
			
		||||
@@ -27,5 +27,19 @@ public class SearchesConfigService : ConfigServiceBase<SearchesConfig>
 | 
			
		||||
            sc => sc.YtProvider,
 | 
			
		||||
            ConfigParsers.InsensitiveEnum,
 | 
			
		||||
            ConfigPrinters.ToString);
 | 
			
		||||
 | 
			
		||||
        Migrate();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// <summary>
    /// Upgrades a config whose version is below 1: sets the version to 1 and
    /// switches the web search engine to <see cref="WebSearchEngine.Google_Scrape"/>.
    /// Does nothing for configs which are already at version 1 or above.
    /// </summary>
    private void Migrate()
    {
        // Already migrated - nothing to do.
        if (data.Version >= 1)
            return;

        ModifyConfig(cfg =>
        {
            cfg.Version = 1;
            cfg.WebSearchEngine = WebSearchEngine.Google_Scrape;
        });
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -1,7 +1,9 @@
 | 
			
		||||
namespace NadekoBot.Modules.Searches;
 | 
			
		||||
// ReSharper disable InconsistentNaming
 | 
			
		||||
namespace NadekoBot.Modules.Searches;
 | 
			
		||||
 | 
			
		||||
/// <summary>
/// Engines which can be selected for web searches.
/// </summary>
public enum WebSearchEngine
{
    /// <summary>Official google api.</summary>
    Google = 0,

    /// <summary>Scrapes the results webpage; requires no api keys.</summary>
    Google_Scrape = 1,

    /// <summary>Searx instance.</summary>
    Searx = 2,
}
 | 
			
		||||
@@ -1,9 +1,12 @@
 | 
			
		||||
# DO NOT CHANGE
 | 
			
		||||
version: 1
 | 
			
		||||
# Which engine should .search command use
 | 
			
		||||
# 'google' requires googleApiKey and google.searchId set in creds.yml
 | 
			
		||||
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
 | 
			
		||||
webSearchEngine: Google
 | 
			
		||||
# 'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
 | 
			
		||||
# 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
 | 
			
		||||
# 'searx' - requires at least one searx instance specified in the 'searxInstances' property below
 | 
			
		||||
webSearchEngine: Google_Scrape
 | 
			
		||||
# Which engine should .image command use
 | 
			
		||||
# 'google' requires googleApiKey and google.imageSearchId set in creds.yml
 | 
			
		||||
# 'google' - official google api. Requires googleApiKey and google.imageSearchId set in creds.yml
 | 
			
		||||
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
 | 
			
		||||
imgSearchEngine: Google
 | 
			
		||||
# Which search provider will be used for the `.youtube` command.
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user