Source code for sherpa_ai.tools
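"""Tools available to Sherpa agents.

This module defines the ``BaseTool`` subclasses an agent can call: arXiv
paper search, Google web search via the Serper API, internal context search
over a vector store, user input, and link scraping/summarization.
"""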
import re
import urllib.parse
from typing import Any, List, Union
import requests
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_core.tools import BaseTool
from langchain_core.vectorstores import VectorStoreRetriever
from loguru import logger
from typing_extensions import Literal
import sherpa_ai.config as cfg
from sherpa_ai.config.task_config import AgentConfig
from sherpa_ai.scrape.extract_github_readme import extract_github_readme
from sherpa_ai.utils import (chunk_and_summarize, count_string_tokens,
get_links_from_text, rewrite_link_references,
scrape_with_url)
HTTP_GET_TIMEOUT = 2.5
def get_tools(memory, config):
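    """Assemble the default tool list for an agent.

    A ``UserInputTool`` is always included. A ``SearchTool`` is added only
    when ``SERPER_API_KEY`` is configured; otherwise a warning is logged and
    web search is skipped.

    Illustrative usage (assuming an ``AgentConfig`` instance; ``memory`` is
    currently only needed by the commented-out ``ContextTool``)::

        tools = get_tools(memory=None, config=AgentConfig())
    """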
tools = []
# tools.append(ContextTool(memory=memory))
tools.append(UserInputTool())
if cfg.SERPER_API_KEY is not None:
search_tool = SearchTool(config=config)
tools.append(search_tool)
else:
logger.warning(
"No SERPER_API_KEY found in environment variables, skipping SearchTool"
)
return tools
class SearchArxivTool(BaseTool):
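    """Search arXiv for scientific papers through its public query API."""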
name: str = "Arxiv Search"
    description: str = (
        "Access all the papers from arXiv to search for domain-specific scientific publications. "  # noqa: E501
        "Only use this tool when you need information from scientific papers."
    )
    def _run(
        self, query: str, return_resources: bool = False
    ) -> Union[str, List[dict]]:
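        """Query the arXiv API and return the top matching papers.

        The query is URL-encoded and sent to ``export.arxiv.org`` with a fixed
        limit of 10 results; titles, summaries, and ids are then extracted from
        the Atom response with regular expressions.

        When ``return_resources`` is True, a list of
        ``{"Document": ..., "Source": ...}`` dicts is returned for citation;
        otherwise a formatted string of titles and summaries is returned.
        """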
top_k = 10
logger.debug(f"Search query: {query}")
query = urllib.parse.quote_plus(query)
url = (
"http://export.arxiv.org/api/query?search_query=all:"
+ query.strip()
+ "&start=0&max_results="
+ str(top_k)
)
data = requests.get(url, timeout=HTTP_GET_TIMEOUT)
xml_content = data.text
summary_pattern = r"<summary>(.*?)</summary>"
summaries = re.findall(summary_pattern, xml_content, re.DOTALL)
title_pattern = r"<title>(.*?)</title>"
titles = re.findall(title_pattern, xml_content, re.DOTALL)
id_pattern = r"<id>(.*?)</id>"
ids = re.findall(id_pattern, xml_content, re.DOTALL)
result_list = []
for i in range(len(titles)):
result_list.append(
"Title: " + titles[i] + "\n" + "Summary: " + summaries[i] + "\n"
)
result = "\n".join(result_list)
# add resources for citation
resources = []
for i in range(len(titles)):
resources.append(
{
"Document": "Title: " + titles[i] + "\nSummary: " + summaries[i],
"Source": ids[i],
}
)
logger.debug(f"Arxiv Search Result: {result_list}")
if return_resources:
return resources
else:
return result
def _arun(self, query: str) -> str:
raise NotImplementedError("SearchArxivTool does not support async run")
class SearchTool(BaseTool):
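    """Web search tool backed by the Google Serper API.

    When ``AgentConfig.search_domains`` is set, one site-restricted query is
    issued per domain (at most 5) and ``top_k`` is split evenly across them.
    """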
name: str = "Search"
config: AgentConfig = AgentConfig()
top_k: int = 10
description: str = (
"Access the internet to search for the information. Only use this tool when "
"you cannot find the information using internal search."
)
def _run(self, query: str, return_resources=False) -> Union[str, List[dict]]:
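        """Run the search, optionally restricted to the configured domains.

        Returns a newline-joined text summary of the results, or a list of
        resource dicts for citation when ``return_resources`` is True.
        """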
result = ""
if self.config.search_domains:
query_list = [
self.formulate_site_search(query, str(i))
for i in self.config.search_domains
]
            if len(query_list) > 5:
                query_list = query_list[:5]
                logger.warning(
                    "Only the first 5 search domains are taken into consideration."
                )
else:
query_list = [query]
if self.config.invalid_domains:
invalid_domain_string = ", ".join(self.config.invalid_domains)
logger.warning(
f"The domain {invalid_domain_string} is invalid and is not taken into consideration." # noqa: E501
)
        top_k = self.top_k // len(query_list)
if return_resources:
resources = []
for query in query_list:
cur_result = self._run_single_query(query, top_k, return_resources)
if return_resources:
resources += cur_result
else:
result += "\n" + cur_result
if return_resources:
return resources
else:
return result
def formulate_site_search(self, query: str, site: str) -> str:
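        """Append a ``site:<domain>`` restriction to the query."""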
return query + " site:" + site
def _run_single_query(
self, query: str, top_k: int, return_resources=False
) -> Union[str, List[dict]]:
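        """Execute one Serper query and format the response.

        An answer box, if present, is returned immediately. Otherwise the
        organic results (up to ``top_k``) are collected, and any knowledge
        graph description is prepended to the final output.
        """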
logger.debug(f"Search query: {query}")
google_serper = GoogleSerperAPIWrapper()
search_results = google_serper._google_serper_api_results(query)
logger.debug(f"Google Search Result: {search_results}")
# case 1: answerBox in the result dictionary
if search_results.get("answerBox", False):
answer_box = search_results.get("answerBox", {})
if answer_box.get("answer"):
answer = answer_box.get("answer")
elif answer_box.get("snippet"):
answer = answer_box.get("snippet").replace("\n", " ")
elif answer_box.get("snippetHighlighted"):
answer = answer_box.get("snippetHighlighted")
title = search_results["organic"][0]["title"]
link = search_results["organic"][0]["link"]
response = "Answer: " + answer
meta = [{"Document": answer, "Source": link}]
if return_resources:
return meta
else:
return response + "\nLink:" + link
# case 2: knowledgeGraph in the result dictionary
snippets = []
if search_results.get("knowledgeGraph", False):
kg = search_results.get("knowledgeGraph", {})
title = kg.get("title")
entity_type = kg.get("type")
if entity_type:
snippets.append(f"{title}: {entity_type}.")
description = kg.get("description")
if description:
snippets.append(description)
for attribute, value in kg.get("attributes", {}).items():
snippets.append(f"{title} {attribute}: {value}.")
search_type: Literal["news", "search", "places", "images"] = "search"
result_key_for_type = {
"news": "news",
"places": "places",
"images": "images",
"search": "organic",
}
# case 3: general search results
for result in search_results[result_key_for_type[search_type]][:top_k]:
if "snippet" in result:
snippets.append(result["snippet"])
for attribute, value in result.get("attributes", {}).items():
snippets.append(f"{attribute}: {value}.")
if len(snippets) == 0:
if return_resources:
return []
else:
return "No good Google Search Result was found"
result = []
resources = []
for i in range(len(search_results["organic"][:top_k])):
r = search_results["organic"][i]
            single_result = r["title"] + " " + r.get("snippet", "")
# If the links are not considered explicitly, add it to the search result
# so that it can be considered by the LLM
if not return_resources:
single_result += "\nLink:" + r["link"]
result.append(single_result)
resources.append(
{
"Document": "Description: " + r["title"] + r["snippet"],
"Source": r["link"],
}
)
full_result = "\n".join(result)
# answer = " ".join(snippets)
if (
"knowledgeGraph" in search_results
and "description" in search_results["knowledgeGraph"]
and "descriptionLink" in search_results["knowledgeGraph"]
):
answer = (
"Description: "
                + search_results["knowledgeGraph"]["title"]
                + " "
                + search_results["knowledgeGraph"]["description"]
+ "\nLink:"
+ search_results["knowledgeGraph"]["descriptionLink"]
)
full_result = answer + "\n\n" + full_result
if return_resources:
return resources
else:
return full_result
def _arun(self, query: str) -> str:
raise NotImplementedError("SearchTool does not support async run")
class ContextTool(BaseTool):
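    """Search internal project documentation stored in a vector store."""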
name: str = "Context Search"
description: str = (
"Access internal technical documentation for AI related projects, including"
+ "Fixie, LangChain, GPT index, GPTCache, GPT4ALL, autoGPT, db-GPT, AgentGPT, sherpa." # noqa: E501
+ "Only use this tool if you need information for these projects specifically."
)
memory: VectorStoreRetriever
    def _run(
        self, query: str, return_resources: bool = False
    ) -> Union[str, List[dict]]:
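        """Retrieve documents relevant to ``query`` from the vector store.

        Returns the concatenated document texts with their source links, or a
        list of resource dicts when ``return_resources`` is True.
        """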
docs = self.memory.get_relevant_documents(query)
result = ""
resources = []
for doc in docs:
            result += (
                "Document: "
                + doc.page_content
                + "\nLink:"
                + doc.metadata.get("source", "")
                + "\n"
            )
if return_resources:
resources.append(
{
"Document": doc.page_content,
"Source": doc.metadata.get("source", ""),
}
)
if return_resources:
return resources
else:
return result
def _arun(self, query: str) -> str:
raise NotImplementedError("ContextTool does not support async run")
class UserInputTool(BaseTool):
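    """Ask the user a clarifying question via standard input."""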
# TODO: Make an action for the user input
name: str = "UserInput"
description: str = (
"Access the user input for the task."
"You use this tool if you need more context and would like to ask clarifying questions to solve the task" # noqa: E501
)
def _run(self, query: str) -> str:
return input(query)
def _arun(self, query: str) -> str:
raise NotImplementedError("UserInputTool does not support async run")
class LinkScraperTool(BaseTool):
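    """Scrape and summarize the contents of links found in the query."""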
name: str = "Link Scraper"
description: str = "Access the content of a link. Only use this tool when you need to extract information from a link."
    def _run(
        self,
        query: str,
        llm: Any,
    ) -> List[dict]:
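        """Extract links from ``query``, scrape them, and summarize the content.

        GitHub links are fetched with ``extract_github_readme``; other links are
        scraped with ``scrape_with_url``. Each page is summarized with
        ``chunk_and_summarize`` until it fits the per-link token budget, and the
        summaries are combined with ``rewrite_link_references``.

        Returns a single-element list of resource dicts containing the rewritten
        summary and the scraped URLs.
        """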
query_links = get_links_from_text(query)
# if there is a link inside the question scrape then summarize based
# on question and then aggregate to the question
if len(query_links) > 0:
# TODO I should get gpt-3.5-turbo from an environment variable or a config file
available_token = 3000 - count_string_tokens(query, "gpt-3.5-turbo")
per_scrape_token_size = available_token / len(query_links)
final_summary = []
for last_message_link in query_links:
link = last_message_link["url"]
scraped_data = ""
if "github" in query_links[-1]["base_url"]:
git_scraper = extract_github_readme(link)
if git_scraper:
scraped_data = {
"data": git_scraper,
"status": 200,
}
else:
scraped_data = {"data": "", "status": 404}
else:
scraped_data = scrape_with_url(link)
if scraped_data["status"] == 200:
chunk_summary = chunk_and_summarize(
link=link,
question=query,
text_data=scraped_data["data"],
# TODO_ user id is not going to be needed here in the future
# user_id="",
llm=llm,
)
while (
count_string_tokens(chunk_summary, "gpt-3.5-turbo")
> per_scrape_token_size
):
chunk_summary = chunk_and_summarize(
link=link,
question=query,
text_data=chunk_summary,
# user_id="",
llm=llm,
)
final_summary.append({"data": chunk_summary, "link": link})
else:
final_summary.append({"data": "Scraping failed", "link": link})
            scraped_data = rewrite_link_references(question=query, data=final_summary)
            resources = [
                {
                    "Document": scraped_data,
                    "Source": ", ".join([link["url"] for link in query_links]),
                }
            ]
            return resources
        # No links were found in the query, so there is nothing to scrape.
        return []
def _arun(self, query: str) -> str:
raise NotImplementedError("LinkScraperTool does not support async run")