Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
jianghuyihei committed Oct 12, 2024
1 parent 6f68607 commit cf2bf33
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 63 deletions.
Binary file modified .DS_Store
Binary file not shown.
7 changes: 3 additions & 4 deletions agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ async def deep_research_paper_with_chain(self,paper:Result):
# search before
while len(idea_chain)<self.max_chain_length:
rerank_query = f"{self.topic} {current_title} {current_abstract}"
citation_paper = await self.reader.search_paper_cited_by_async(current_title,rerank_query=rerank_query,llm=self.llm,paper_list=idea_papers)
citation_paper = await self.reader.search_related_paper_async(current_title,need_reference=False,rerank_query=rerank_query,llm=self.llm,paper_list=idea_papers)
if not citation_paper:
break

Expand Down Expand Up @@ -364,10 +364,9 @@ async def deep_research_paper_with_chain(self,paper:Result):

if not article:
rerank_query = f"topic: {self.topic} Title: {current_title} Abstract: {current_abstract}"
search_paper = await self.reader.search_related_paper_async(current_title,need_citation=False,rerank_query = rerank_query,llm=self.llm,paper_list=idea_papers)
if len(search_paper) == 0:
s_p = await self.reader.search_related_paper_async(current_title,need_citation=False,rerank_query = rerank_query,llm=self.llm,paper_list=idea_papers)
if not s_p:
continue
s_p = search_paper[0]
if len(idea_chain) < self.min_chain_length:
article = await self.reader.read_arxiv_from_link_async(s_p.pdf_link,f"{s_p.title}.pdf")
if not article:
Expand Down
79 changes: 20 additions & 59 deletions searcher/sementic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,28 @@ def extract(text, type):


async def fetch(url):
time.sleep(1)
await asyncio.sleep(1)
try:
timeout = aiohttp.ClientTimeout(total=180)
timeout = aiohttp.ClientTimeout(total=120)
async with aiohttp.ClientSession(timeout=timeout) as session:
async with session.get(url) as response:
if response.status == 200:
content = await response.read() # Read the response content as bytes
return content
else:
await asyncio.sleep(0.01)
print(f"Failed to fetch the URL: {url}")
print(f"Failed to fetch the URL: {url} with status code: {response.status}")
return None
except Exception as e:
except aiohttp.ClientError as e: # 更具体的异常捕获
await asyncio.sleep(0.01)
print(f"An error occurred while fetching the URL: {url}")
print(e)
return None
except Exception as e:
await asyncio.sleep(0.01)
print(f"An unexpected error occurred while fetching the URL: {url}")
print(e)
return None

class Result:
def __init__(self,title="",abstract="",pdf_link="",citations_conut = 0,year = None) -> None:
Expand Down Expand Up @@ -300,65 +305,16 @@ async def search_async(self,query,max_results = 5 ,paper_list = None ,rerank_que
if len(final_results) >= max_results:
break
return final_results

async def search_paper_cited_by_async(self,title,rerank_query,llm,paper_list = []):
print(f"Searching for papers cited by <{title}>")
results = await self.search_papers_async(title,limit = 5)
citations = []
if not results or "data" not in results:
return None
for result in results["data"]:
for citation in result["citations"]:
if os.path.exists(os.path.join(self.save_file, f"{citation['title']}.pdf")) and citation["title"] not in paper_list:
citations.append(citation)
elif citation["title"] in self.ban_paper or citation["isOpenAccess"] == False or citation["openAccessPdf"] == None or citation["abstract"] == None or citation["title"] in paper_list:
continue
else:
citations.append(citation)

if len(citations) >= 200:
import random
citations = random.sample(citations,50)

if llm and rerank_query:
rerank_query_embbeding = llm.get_embbeding(rerank_query)
rerank_query_embbeding = np.array(rerank_query_embbeding)
citations = await self.rerank_papers_async(rerank_query_embbeding, citations,llm)
citations = [citation[0] for citation in citations]

relevant_citation = None

for citation in citations:
if os.path.exists(os.path.join(self.save_file, f"{citation['title']}.pdf")):
relevant_citation = citation
break
flag = await self.download_pdf_async(citation["openAccessPdf"]["url"],f"{citation['title']}.pdf")
if flag:
citation["pdf_link"] = f"{citation['title']}.pdf"
relevant_citation = citation
break
if not relevant_citation:
for citation in citations:
print(citation["title"],end="")
print(f"No relevant citation found for the paper <{title}>")
return None
title = relevant_citation["title"] if "title" in relevant_citation else ""
abstract = relevant_citation["abstract"] if "abstract" in relevant_citation else ""
pdf_link = relevant_citation["pdf_link"] if "pdf_link" in relevant_citation else ""
citationCount = relevant_citation["citationCount"] if "citationCount" in relevant_citation else 0
year = relevant_citation["year"] if "year" in relevant_citation else None
result = Result(title,abstract,pdf_link,citationCount,year)
return result


async def search_related_paper_async(self,title,need_citation = True,need_reference = True,rerank_query = None,llm = None,paper_list = []):
print(f"Searching for papers related to <{title}>")
print(f"Searching for related papers of paper <{title}>; Citation:{need_citation}; Reference:{need_reference}")
fileds = ["title","abstract","citations.title","citations.abstract","citations.citationCount","references.title","references.abstract","references.citationCount","citations.isOpenAccess","citations.openAccessPdf","references.isOpenAccess","references.openAccessPdf","citations.year","references.year"]
results = await self.search_papers_async(title,limit = 3,fields=fileds)
related_papers = []
related_papers_title = []
if not results or "data" not in results:
return []
print(f"Failed to find related papers of paper <{title}>; Citation:{need_citation}; Reference:{need_reference}")
return None
for result in results["data"]:
if not result:
continue
Expand Down Expand Up @@ -403,9 +359,14 @@ async def search_related_paper_async(self,title,need_citation = True,need_refere
related_papers = [[paper["title"],paper["abstract"],paper["openAccessPdf"]["url"],paper["citationCount"],paper['year']] for paper in related_papers]
related_papers = sorted(related_papers,key = lambda x: x[3],reverse = True)
related_papers = [Result(paper[0],paper[1],paper[2],paper[3]) for paper in related_papers]
if len(related_papers) >= 10:
related_papers = related_papers[:10]
return related_papers

for paper in related_papers:
url = paper.pdf_link
article = await self.read_arxiv_from_link_async(url, f"{paper.title}.pdf")
if article:
return paper
print(f"Failed to find related papers of paper <{title}>; Citation:{need_citation}; Reference:{need_reference}")
return None


async def search_paper_for_abstract_async(self,query,fields = ["title","abstract"]):
Expand Down

0 comments on commit cf2bf33

Please sign in to comment.