mirror of
https://gitlab.com/Kwoth/nadekobot.git
synced 2025-09-10 17:28:27 -04:00
Re-added .google search result scraper and set it as the default again. Also added versioning to searches.yml as it was forgotten previously
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
using NadekoBot.Modules.Searches.Youtube;
|
using NadekoBot.Modules.Searches.GoogleScrape;
|
||||||
|
using NadekoBot.Modules.Searches.Youtube;
|
||||||
|
|
||||||
namespace NadekoBot.Modules.Searches;
|
namespace NadekoBot.Modules.Searches;
|
||||||
|
|
||||||
@@ -12,10 +13,12 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
|
|||||||
private readonly YtdlYoutubeSearchService _ytdl;
|
private readonly YtdlYoutubeSearchService _ytdl;
|
||||||
private readonly YoutubeDataApiSearchService _ytdata;
|
private readonly YoutubeDataApiSearchService _ytdata;
|
||||||
private readonly InvidiousYtSearchService _iYtSs;
|
private readonly InvidiousYtSearchService _iYtSs;
|
||||||
|
private readonly GoogleScrapeService _gscs;
|
||||||
|
|
||||||
public DefaultSearchServiceFactory(
|
public DefaultSearchServiceFactory(
|
||||||
SearchesConfigService scs,
|
SearchesConfigService scs,
|
||||||
GoogleSearchService gss,
|
GoogleSearchService gss,
|
||||||
|
GoogleScrapeService gscs,
|
||||||
SearxSearchService sss,
|
SearxSearchService sss,
|
||||||
YtdlpYoutubeSearchService ytdlp,
|
YtdlpYoutubeSearchService ytdlp,
|
||||||
YtdlYoutubeSearchService ytdl,
|
YtdlYoutubeSearchService ytdl,
|
||||||
@@ -25,6 +28,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
|
|||||||
_scs = scs;
|
_scs = scs;
|
||||||
_sss = sss;
|
_sss = sss;
|
||||||
_gss = gss;
|
_gss = gss;
|
||||||
|
_gscs = gscs;
|
||||||
_iYtSs = iYtSs;
|
_iYtSs = iYtSs;
|
||||||
|
|
||||||
_ytdlp = ytdlp;
|
_ytdlp = ytdlp;
|
||||||
@@ -36,6 +40,7 @@ public sealed class DefaultSearchServiceFactory : ISearchServiceFactory, INServi
|
|||||||
=> _scs.Data.WebSearchEngine switch
|
=> _scs.Data.WebSearchEngine switch
|
||||||
{
|
{
|
||||||
WebSearchEngine.Google => _gss,
|
WebSearchEngine.Google => _gss,
|
||||||
|
WebSearchEngine.Google_Scrape => _gscs,
|
||||||
WebSearchEngine.Searx => _sss,
|
WebSearchEngine.Searx => _sss,
|
||||||
_ => _gss
|
_ => _gss
|
||||||
};
|
};
|
||||||
|
@@ -39,7 +39,7 @@ public sealed class GoogleSearchService : SearchServiceBase, INService
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public override async ITask<GoogleCustomSearchResult?> SearchAsync(string query)
|
public override async ITask<GoogleCustomSearchResult?> SearchAsync(string? query)
|
||||||
{
|
{
|
||||||
ArgumentNullException.ThrowIfNull(query);
|
ArgumentNullException.ThrowIfNull(query);
|
||||||
|
|
||||||
|
@@ -1,62 +1,121 @@
|
|||||||
// using AngleSharp.Html.Dom;
|
using AngleSharp.Html.Dom;
|
||||||
// using MorseCode.ITask;
|
using AngleSharp.Html.Parser;
|
||||||
// using NadekoBot.Modules.Searches.Common;
|
using MorseCode.ITask;
|
||||||
//
|
|
||||||
// namespace NadekoBot.Modules.Searches.GoogleScrape;
|
namespace NadekoBot.Modules.Searches.GoogleScrape;
|
||||||
//
|
|
||||||
// public sealed class GoogleScrapeService : SearchServiceBase
|
public sealed class GoogleScrapeService : SearchServiceBase, INService
|
||||||
// {
|
{
|
||||||
// public override async ITask<GoogleSearchResultData> SearchAsync(string query)
|
private static readonly HtmlParser _googleParser = new(new()
|
||||||
// {
|
{
|
||||||
// ArgumentNullException.ThrowIfNull(query);
|
IsScripting = false,
|
||||||
//
|
IsEmbedded = false,
|
||||||
// query = Uri.EscapeDataString(query)?.Replace(' ', '+');
|
IsSupportingProcessingInstructions = false,
|
||||||
//
|
IsKeepingSourceReferences = false,
|
||||||
// var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";
|
IsNotSupportingFrames = true
|
||||||
//
|
});
|
||||||
// using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
|
|
||||||
// msg.Headers.Add("User-Agent",
|
|
||||||
// "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
|
private readonly IHttpClientFactory _httpFactory;
|
||||||
// msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");
|
|
||||||
//
|
public GoogleScrapeService(IHttpClientFactory httpClientFactory)
|
||||||
// using var http = _httpFactory.CreateClient();
|
=> _httpFactory = httpClientFactory;
|
||||||
// http.DefaultRequestHeaders.Clear();
|
|
||||||
//
|
public override async ITask<ISearchResult?> SearchAsync(string? query)
|
||||||
// using var response = await http.SendAsync(msg);
|
{
|
||||||
// await using var content = await response.Content.ReadAsStreamAsync();
|
ArgumentNullException.ThrowIfNull(query);
|
||||||
//
|
|
||||||
// using var document = await _googleParser.ParseDocumentAsync(content);
|
query = Uri.EscapeDataString(query)?.Replace(' ', '+');
|
||||||
// var elems = document.QuerySelectorAll("div.g > div > div");
|
|
||||||
//
|
var fullQueryLink = $"https://www.google.ca/search?q={query}&safe=on&lr=lang_eng&hl=en&ie=utf-8&oe=utf-8";
|
||||||
// var resultsElem = document.QuerySelectorAll("#resultStats").FirstOrDefault();
|
|
||||||
// var totalResults = resultsElem?.TextContent;
|
using var msg = new HttpRequestMessage(HttpMethod.Get, fullQueryLink);
|
||||||
// //var time = resultsElem.Children.FirstOrDefault()?.TextContent
|
msg.Headers.Add("User-Agent",
|
||||||
// //^ this doesn't work for some reason, <nobr> is completely missing in parsed collection
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36");
|
||||||
// if (!elems.Any())
|
msg.Headers.Add("Cookie", "CONSENT=YES+shp.gws-20210601-0-RC2.en+FX+423;");
|
||||||
// return default;
|
|
||||||
//
|
using var http = _httpFactory.CreateClient();
|
||||||
// var results = elems.Select(elem =>
|
http.DefaultRequestHeaders.Clear();
|
||||||
// {
|
|
||||||
// var children = elem.Children.ToList();
|
using var response = await http.SendAsync(msg);
|
||||||
// if (children.Count < 2)
|
await using var content = await response.Content.ReadAsStreamAsync();
|
||||||
// return null;
|
|
||||||
//
|
using var document = await _googleParser.ParseDocumentAsync(content);
|
||||||
// var href = (children[0].QuerySelector("a") as IHtmlAnchorElement)?.Href;
|
var elems = document.QuerySelectorAll("div.g, div.mnr-c > div > div");
|
||||||
// var name = children[0].QuerySelector("h3")?.TextContent;
|
|
||||||
//
|
var resultsElem = document.QuerySelector("#result-stats");
|
||||||
// if (href is null || name is null)
|
var resultsArr = resultsElem?.TextContent.Split("results");
|
||||||
// return null;
|
var totalResults = resultsArr?.Length is null or 0
|
||||||
//
|
? null
|
||||||
// var txt = children[1].TextContent;
|
: resultsArr[0];
|
||||||
//
|
|
||||||
// if (string.IsNullOrWhiteSpace(txt))
|
var time = resultsArr is null or {Length: < 2}
|
||||||
// return null;
|
? null
|
||||||
//
|
: resultsArr[1]
|
||||||
// return new GoogleSearchResult(name, href, txt);
|
.Replace("(", string.Empty)
|
||||||
// })
|
.Replace("seconds)", string.Empty);
|
||||||
// .Where(x => x is not null)
|
|
||||||
// .ToList();
|
//var time = resultsElem.Children.FirstOrDefault()?.TextContent
|
||||||
//
|
//^ this doesn't work for some reason, <nobr> is completely missing in parsed collection
|
||||||
// return new(results.AsReadOnly(), fullQueryLink, totalResults);
|
if (!elems.Any())
|
||||||
// }
|
return default;
|
||||||
// }
|
|
||||||
|
var results = elems.Select(elem =>
|
||||||
|
{
|
||||||
|
var aTag = elem.QuerySelector("a");
|
||||||
|
|
||||||
|
if (aTag is null)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
var url = ((IHtmlAnchorElement)aTag).Href;
|
||||||
|
var title = aTag.QuerySelector("h3")?.TextContent;
|
||||||
|
|
||||||
|
var txt = aTag.ParentElement
|
||||||
|
?.NextElementSibling
|
||||||
|
?.QuerySelector("span")
|
||||||
|
?.TextContent
|
||||||
|
.StripHtml()
|
||||||
|
?? elem
|
||||||
|
?.QuerySelectorAll("span")
|
||||||
|
.Skip(3)
|
||||||
|
.FirstOrDefault()
|
||||||
|
?.TextContent
|
||||||
|
.StripHtml();
|
||||||
|
// .Select(x => x.TextContent.StripHtml())
|
||||||
|
// .Join("\n");
|
||||||
|
|
||||||
|
if (string.IsNullOrWhiteSpace(url)
|
||||||
|
|| string.IsNullOrWhiteSpace(title)
|
||||||
|
|| string.IsNullOrWhiteSpace(txt))
|
||||||
|
return null;
|
||||||
|
|
||||||
|
return new PlainSearchResultEntry
|
||||||
|
{
|
||||||
|
Title = title,
|
||||||
|
Url = url,
|
||||||
|
DisplayUrl = url,
|
||||||
|
Description = txt
|
||||||
|
};
|
||||||
|
})
|
||||||
|
.Where(x => x is not null)
|
||||||
|
.ToList();
|
||||||
|
|
||||||
|
// return new GoogleSearchResult(results.AsReadOnly(), fullQueryLink, totalResults);
|
||||||
|
|
||||||
|
return new PlainGoogleScrapeSearchResult()
|
||||||
|
{
|
||||||
|
Answer = null,
|
||||||
|
Entries = results!,
|
||||||
|
Info = new PlainSearchResultInfo()
|
||||||
|
{
|
||||||
|
SearchTime = time ?? "?",
|
||||||
|
TotalResults = totalResults ?? "?"
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Image search is not implemented for the scraper; merge requests are welcome
|
||||||
|
public override ITask<IImageSearchResult?> SearchImagesAsync(string query)
|
||||||
|
=> throw new NotSupportedException();
|
||||||
|
}
|
@@ -0,0 +1,8 @@
|
|||||||
|
namespace NadekoBot.Modules.Searches.GoogleScrape;
|
||||||
|
|
||||||
|
public class PlainGoogleScrapeSearchResult : ISearchResult
|
||||||
|
{
|
||||||
|
public string? Answer { get; init; } = null!;
|
||||||
|
public IReadOnlyCollection<ISearchResultEntry> Entries { get; init; } = null!;
|
||||||
|
public ISearchResultInformation Info { get; init; } = null!;
|
||||||
|
}
|
@@ -0,0 +1,9 @@
|
|||||||
|
namespace NadekoBot.Modules.Searches.GoogleScrape;
|
||||||
|
|
||||||
|
public sealed class PlainSearchResultEntry : ISearchResultEntry
|
||||||
|
{
|
||||||
|
public string Title { get; init; } = null!;
|
||||||
|
public string Url { get; init; } = null!;
|
||||||
|
public string DisplayUrl { get; init; } = null!;
|
||||||
|
public string? Description { get; init; } = null!;
|
||||||
|
}
|
@@ -0,0 +1,7 @@
|
|||||||
|
namespace NadekoBot.Modules.Searches.GoogleScrape;
|
||||||
|
|
||||||
|
public sealed class PlainSearchResultInfo : ISearchResultInformation
|
||||||
|
{
|
||||||
|
public string TotalResults { get; init; } = null!;
|
||||||
|
public string SearchTime { get; init; } = null!;
|
||||||
|
}
|
@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
|
|||||||
|
|
||||||
public interface ISearchService
|
public interface ISearchService
|
||||||
{
|
{
|
||||||
ITask<ISearchResult?> SearchAsync(string query);
|
ITask<ISearchResult?> SearchAsync(string? query);
|
||||||
ITask<IImageSearchResult?> SearchImagesAsync(string query);
|
ITask<IImageSearchResult?> SearchImagesAsync(string query);
|
||||||
}
|
}
|
@@ -4,6 +4,6 @@ namespace NadekoBot.Modules.Searches;
|
|||||||
|
|
||||||
public abstract class SearchServiceBase : ISearchService
|
public abstract class SearchServiceBase : ISearchService
|
||||||
{
|
{
|
||||||
public abstract ITask<ISearchResult?> SearchAsync(string query);
|
public abstract ITask<ISearchResult?> SearchAsync(string? query);
|
||||||
public abstract ITask<IImageSearchResult?> SearchImagesAsync(string query);
|
public abstract ITask<IImageSearchResult?> SearchImagesAsync(string query);
|
||||||
}
|
}
|
@@ -25,7 +25,7 @@ public sealed class SearxSearchService : SearchServiceBase, INService
|
|||||||
return instances[_rng.Next(0, instances.Count)];
|
return instances[_rng.Next(0, instances.Count)];
|
||||||
}
|
}
|
||||||
|
|
||||||
public override async ITask<SearxSearchResult> SearchAsync(string query)
|
public override async ITask<SearxSearchResult> SearchAsync(string? query)
|
||||||
{
|
{
|
||||||
ArgumentNullException.ThrowIfNull(query);
|
ArgumentNullException.ThrowIfNull(query);
|
||||||
|
|
||||||
|
@@ -6,13 +6,17 @@ namespace NadekoBot.Modules.Searches;
|
|||||||
[Cloneable]
|
[Cloneable]
|
||||||
public partial class SearchesConfig : ICloneable<SearchesConfig>
|
public partial class SearchesConfig : ICloneable<SearchesConfig>
|
||||||
{
|
{
|
||||||
|
[Comment("DO NOT CHANGE")]
|
||||||
|
public int Version { get; set; } = 0;
|
||||||
|
|
||||||
[Comment(@"Which engine should .search command
|
[Comment(@"Which engine should .search command
|
||||||
'google' requires googleApiKey and google.searchId set in creds.yml
|
'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
|
||||||
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
|
'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
|
||||||
public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google;
|
'searx' - requires at least one searx instance specified in the 'searxInstances' property below")]
|
||||||
|
public WebSearchEngine WebSearchEngine { get; set; } = WebSearchEngine.Google_Scrape;
|
||||||
|
|
||||||
[Comment(@"Which engine should .image command use
|
[Comment(@"Which engine should .image command use
|
||||||
'google' requires googleApiKey and google.imageSearchId set in creds.yml
|
'google'- official google api. googleApiKey and google.imageSearchId set in creds.yml
|
||||||
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
|
'searx' requires at least one searx instance specified in the 'searxInstances' property below")]
|
||||||
public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google;
|
public ImgSearchEngine ImgSearchEngine { get; set; } = ImgSearchEngine.Google;
|
||||||
|
|
||||||
|
@@ -27,5 +27,19 @@ public class SearchesConfigService : ConfigServiceBase<SearchesConfig>
|
|||||||
sc => sc.YtProvider,
|
sc => sc.YtProvider,
|
||||||
ConfigParsers.InsensitiveEnum,
|
ConfigParsers.InsensitiveEnum,
|
||||||
ConfigPrinters.ToString);
|
ConfigPrinters.ToString);
|
||||||
|
|
||||||
|
Migrate();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void Migrate()
|
||||||
|
{
|
||||||
|
if (data.Version < 1)
|
||||||
|
{
|
||||||
|
ModifyConfig(c =>
|
||||||
|
{
|
||||||
|
c.Version = 1;
|
||||||
|
c.WebSearchEngine = WebSearchEngine.Google_Scrape;
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -1,7 +1,9 @@
|
|||||||
namespace NadekoBot.Modules.Searches;
|
// ReSharper disable InconsistentNaming
|
||||||
|
namespace NadekoBot.Modules.Searches;
|
||||||
|
|
||||||
public enum WebSearchEngine
|
public enum WebSearchEngine
|
||||||
{
|
{
|
||||||
Google,
|
Google,
|
||||||
|
Google_Scrape,
|
||||||
Searx,
|
Searx,
|
||||||
}
|
}
|
@@ -1,9 +1,12 @@
|
|||||||
|
# DO NOT CHANGE
|
||||||
|
version: 1
|
||||||
# Which engine should .search command use
|
# Which engine should .search command use
|
||||||
# 'google' requires googleApiKey and google.searchId set in creds.yml
|
# 'google_scrape' - default. Scrapes the webpage for results. May break. Requires no api keys.
|
||||||
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
|
# 'google' - official google api. Requires googleApiKey and google.searchId set in creds.yml
|
||||||
webSearchEngine: Google
|
# 'searx' - requires at least one searx instance specified in the 'searxInstances' property below
|
||||||
|
webSearchEngine: Google_Scrape
|
||||||
# Which engine should .image command use
|
# Which engine should .image command use
|
||||||
# 'google' requires googleApiKey and google.imageSearchId set in creds.yml
|
# 'google' - official google api. Requires googleApiKey and google.imageSearchId set in creds.yml
|
||||||
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
|
# 'searx' requires at least one searx instance specified in the 'searxInstances' property below
|
||||||
imgSearchEngine: Google
|
imgSearchEngine: Google
|
||||||
# Which search provider will be used for the `.youtube` command.
|
# Which search provider will be used for the `.youtube` command.
|
||||||
|
Reference in New Issue
Block a user