mirror of https://gitlab.com/Kwoth/nadekobot.git, synced 2025-09-10 17:28:27 -04:00
Re-added .google search result scraper and set it as the default again. Also added versioning to searches.yml as it was forgotten previously
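At a glance, the top of a freshly generated (or migrated) searches.yml after this change would look roughly like the following; this is a sketch assembled from the config diff further down, not a verbatim copy of the shipped file:

# DO NOT CHANGE
version: 1
# 'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
# 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
# 'searx' - requires at least one searx instance specified in the 'searxInstances' property below
webSearchEngine: Google_Scrape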
@@ -1,4 +1,5 @@
using NadekoBot.Modules.Searches.Youtube;
using NadekoBot.Modules.Searches.GoogleScrape;
using NadekoBot.Modules.Searches.Youtube;

namespace NadekoBot.Modules.Searches;

@@ -12,10 +13,12 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
    private readonly YtdlYoutubeSearchService _ytdl;
    private readonly YoutubeDataApiSearchService _ytdata;
    private readonly InvidiousYtSearchService _iYtSs;
    private readonly GoogleScrapeService _gscs;

    public DefaultSearchServiceFactory(
        SearchesConfigService scs,
        GoogleSearchService gss,
        GoogleScrapeService gscs,
        SearxSearchService sss,
        YtdlpYoutubeSearchService ytdlp,
        YtdlYoutubeSearchService ytdl,
@@ -25,6 +28,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
        _scs = scs;
        _sss = sss;
        _gss = gss;
        _gscs = gscs;
        _iYtSs = iYtSs;

        _ytdlp = ytdlp;
@@ -36,6 +40,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
        => _scs.Data.WebSearchEngine switch
        {
            WebSearchEngine.Google => _gss,
            WebSearchEngine.Google_Scrape => _gscs,
            WebSearchEngine.Searx => _sss,
            _ => _gss
        };
@@ -39,7 +39,7 @@ public sealed class GoogleSearchService : SearchServiceBase, INService
        return result;
    }

    public override async ITask<GoogleCustomSearchResult?> SearchAsync(string query)
    public override async ITask<GoogleCustomSearchResult?> SearchAsync(string? query)
    {
        ArgumentNullException.ThrowIfNull(query);

@@ -1,62 +1,121 @@
// using AngleSharp.Html.Dom;
// using MorseCode.ITask;
// using NadekoBot.Modules.Searches.Common;
//
// namespace NadekoBot.Modules.Searches.GoogleScrape;
//
// public sealed class GoogleScrapeService : SearchServiceBase
// {
// public override async ITask<GoogleSearchResultData> SearchAsync(string query)
// {
// ArgumentNullException.ThrowIfNull(query);
//
// query = Uri.EscapeDataString(query)?.Replace(' ', '+');
//
// var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";
//
// using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
// msg.Headers.Add("User-Agent",
// "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
// msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");
//
// using var http = _httpFactory.CreateClient();
// http.DefaultRequestHeaders.Clear();
//
// using var response = await http.SendAsync(msg);
// await using var content = await response.Content.ReadAsStreamAsync();
//
// using var document = await _googleParser.ParseDocumentAsync(content);
// var elems = document.QuerySelectorAll("div.g > div > div");
//
// var resultsElem = document.QuerySelectorAll("#resultStats").FirstOrDefault();
// var totalResults = resultsElem?.TextContent;
// //var time = resultsElem.Children.FirstOrDefault()?.TextContent
// //^ this doesn't work for some reason, <nobr> is completely missing in parsed collection
// if (!elems.Any())
// return default;
//
// var results = elems.Select(elem =>
// {
// var children = elem.Children.ToList();
// if (children.Count < 2)
// return null;
//
// var href = (children[0].QuerySelector("a") as IHtmlAnchorElement)?.Href;
// var name = children[0].QuerySelector("h3")?.TextContent;
//
// if (href is null || name is null)
// return null;
//
// var txt = children[1].TextContent;
//
// if (string.IsNullOrWhiteSpace(txt))
// return null;
//
// return new GoogleSearchResult(name, href, txt);
// })
// .Where(x => x is not null)
// .ToList();
//
// return new(results.AsReadOnly(), fullQueryLink, totalResults);
// }
// }
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using MorseCode.ITask;

namespace NadekoBot.Modules.Searches.GoogleScrape;

public sealed class GoogleScrapeService : SearchServiceBase, INService
{
    private static readonly HtmlParser _googleParser = new(new()
    {
        IsScripting = false,
        IsEmbedded = false,
        IsSupportingProcessingInstructions = false,
        IsKeepingSourceReferences = false,
        IsNotSupportingFrames = true
    });


    private readonly IHttpClientFactory _httpFactory;

    public GoogleScrapeService(IHttpClientFactory httpClientFactory)
        => _httpFactory = httpClientFactory;

    public override async ITask<ISearchResult?> SearchAsync(string? query)
    {
        ArgumentNullException.ThrowIfNull(query);

        query = Uri.EscapeDataString(query)?.Replace(' ', '+');

        var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";

        using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
        msg.Headers.Add("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
        msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");

        using var http = _httpFactory.CreateClient();
        http.DefaultRequestHeaders.Clear();

        using var response = await http.SendAsync(msg);
        await using var content = await response.Content.ReadAsStreamAsync();

        using var document = await _googleParser.ParseDocumentAsync(content);
        var elems = document.QuerySelectorAll("div.g, div.mnr-c > div > div");

        var resultsElem = document.QuerySelector("#result-stats");
        var resultsArr = resultsElem?.TextContent.Split("results");
        var totalResults = resultsArr?.Length is null or 0
            ? null
            : resultsArr[0];

        var time = resultsArr is null or {Length: < 2}
            ? null
            : resultsArr[1]
                .Replace("(", string.Empty)
                .Replace("seconds)", string.Empty);

        //var time = resultsElem.Children.FirstOrDefault()?.TextContent
        //^ this doesn't work for some reason, <nobr> is completely missing in parsed collection
        if (!elems.Any())
            return default;

        var results = elems.Select(elem =>
            {
                var aTag = elem.QuerySelector("a");

                if (aTag is null)
                    return null;

                var url = ((IHtmlAnchorElement)aTag).Href;
                var title = aTag.QuerySelector("h3")?.TextContent;

                var txt = aTag.ParentElement
                              ?.NextElementSibling
                              ?.QuerySelector("span")
                              ?.TextContent
                              .StripHtml()
                          ?? elem
                              ?.QuerySelectorAll("span")
                              .Skip(3)
                              .FirstOrDefault()
                              ?.TextContent
                              .StripHtml();
                // .Select(x => x.TextContent.StripHtml())
                // .Join("\n");

                if (string.IsNullOrWhiteSpace(url)
                    || string.IsNullOrWhiteSpace(title)
                    || string.IsNullOrWhiteSpace(txt))
                    return null;

                return new PlainSearchResultEntry
                {
                    Title = title,
                    Url = url,
                    DisplayUrl = url,
                    Description = txt
                };
            })
            .Where(x => x is not null)
            .ToList();

        // return new GoogleSearchResult(results.AsReadOnly(), fullQueryLink, totalResults);

        return new PlainGoogleScrapeSearchResult()
        {
            Answer = null,
            Entries = results!,
            Info = new PlainSearchResultInfo()
            {
                SearchTime = time ?? "?",
                TotalResults = totalResults ?? "?"
            }
        };
    }


    // someone can mr this
    public override ITask<IImageSearchResult?> SearchImagesAsync(string query)
        => throw new NotSupportedException();
}
@@ -0,0 +1,8 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;

public class PlainGoogleScrapeSearchResult : ISearchResult
{
    public string? Answer { get; init; } = null!;
    public IReadOnlyCollection<ISearchResultEntry> Entries { get; init; } = null!;
    public ISearchResultInformation Info { get; init; } = null!;
}
@@ -0,0 +1,9 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;

public sealed class PlainSearchResultEntry : ISearchResultEntry
{
    public string Title { get; init; } = null!;
    public string Url { get; init; } = null!;
    public string DisplayUrl { get; init; } = null!;
    public string? Description { get; init; } = null!;
}
@@ -0,0 +1,7 @@
namespace NadekoBot.Modules.Searches.GoogleScrape;

public sealed class PlainSearchResultInfo : ISearchResultInformation
{
    public string TotalResults { get; init; } = null!;
    public string SearchTime { get; init; } = null!;
}
@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;

public interface ISearchService
{
    ITask<ISearchResult?> SearchAsync(string query);
    ITask<ISearchResult?> SearchAsync(string? query);
    ITask<IImageSearchResult?> SearchImagesAsync(string query);
}
@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;

public abstract class SearchServiceBase : ISearchService
{
    public abstract ITask<ISearchResult?> SearchAsync(string query);
    public abstract ITask<ISearchResult?> SearchAsync(string? query);
    public abstract ITask<IImageSearchResult?> SearchImagesAsync(string query);
}
@@ -25,7 +25,7 @@ public sealed class SearxSearchService : SearchServiceBase, INService
        return instances[_rng.Next(0, instances.Count)];
    }

    public override async ITask<SearxSearchResult> SearchAsync(string query)
    public override async ITask<SearxSearchResult> SearchAsync(string? query)
    {
        ArgumentNullException.ThrowIfNull(query);

@@ -6,13 +6,17 @@ namespace NadekoBot.Modules.Searches;
[Cloneable]
public partial class SearchesConfig : ICloneable<SearchesConfig>
{
    [Comment("DO NOT CHANGE")]
    public int Version { get; set; } = 0;

    [Comment(@"Which engine should .search command
'google' requires googleApiKey and google.searchId set in creds.yml
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
    public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google;
'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
'searx' - requires at least one searx instance specified in the 'searxInstances' property below")]
    public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google_Scrape;

    [Comment(@"Which engine should .image command use
'google' requires googleApiKey and google.imageSearchId set in creds.yml
'google'- official google api. googleApiKey and google.imageSearchId set in creds.yml
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
    public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google;

@@ -27,5 +27,19 @@ public class SearchesConfigService : ConfigServiceBase<SearchesConfig>
            sc => sc.YtProvider,
            ConfigParsers.InsensitiveEnum,
            ConfigPrinters.ToString);

        Migrate();
    }

    private void Migrate()
    {
        if (data.Version < 1)
        {
            ModifyConfig(c =>
            {
                c.Version = 1;
                c.WebSearchEngine = WebSearchEngine.Google_Scrape;
            });
        }
    }
}
@@ -1,7 +1,9 @@
namespace NadekoBot.Modules.Searches;
// ReSharper disable InconsistentNaming
namespace NadekoBot.Modules.Searches;

public enum WebSearchEngine
{
    Google,
    Google_Scrape,
    Searx,
}
@@ -1,9 +1,12 @@
# DO NOT CHANGE
version: 1
# Which engine should .search command
# 'google' requires googleApiKey and google.searchId set in creds.yml
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
webSearchEngine: Google
# 'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
# 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
# 'searx' - requires at least one searx instance specified in the 'searxInstances' property below
webSearchEngine: Google_Scrape
# Which engine should .image command use
# 'google' requires googleApiKey and google.imageSearchId set in creds.yml
# 'google'- official google api. googleApiKey and google.imageSearchId set in creds.yml
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
imgSearchEngine: Google
# Which search provider will be used for the `.youtube` command.