Add agent component for web crawler (#2878)
### What problem does this PR solve?

Adds an agent component for a web crawler, backed by crawl4ai's `AsyncWebCrawler`, together with the corresponding flow-editor form, icon, and locale strings (a usage sketch follows below).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
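
For orientation, here is a minimal sketch of the crawl4ai call that the new `Crawler` component wraps. The URL is illustrative; `verbose` and `bypass_cache` simply mirror the settings used in `agent/component/crawler.py` below:

```python
# Minimal sketch of the crawl4ai usage wrapped by the Crawler component.
# The URL is illustrative; settings mirror agent/component/crawler.py.
import asyncio

from crawl4ai import AsyncWebCrawler


async def fetch_markdown(url: str) -> str:
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=url, bypass_cache=True)
        # crawl4ai also exposes result.cleaned_html and result.extracted_content
        return result.markdown


if __name__ == "__main__":
    print(asyncio.run(fetch_markdown("https://example.com")))
```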
wingjson authored Oct 21, 2024
1 parent c1d0473 commit 4bdf3fd
Showing 9 changed files with 152 additions and 0 deletions.
1 change: 1 addition & 0 deletions agent/component/__init__.py
@@ -28,6 +28,7 @@
from .jin10 import Jin10, Jin10Param
from .tushare import TuShare, TuShareParam
from .akshare import AkShare, AkShareParam
from .crawler import Crawler, CrawlerParam


def component_class(class_name):
71 changes: 71 additions & 0 deletions agent/component/crawler.py
@@ -0,0 +1,71 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import asyncio
from crawl4ai import AsyncWebCrawler
from agent.component.base import ComponentBase, ComponentParamBase

class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.
    """

    def __init__(self):
        super().__init__()
        # Defaults for the parameters read in Crawler._run / get_web below.
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        self.check_valid_value(self.extract_type, "Type of content from the crawler", ['html', 'markdown', 'content'])


class Crawler(ComponentBase, ABC):
    component_name = "Crawler"

    def _run(self, history, **kwargs):
        # Upstream output is expected to carry the URL to crawl.
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Crawler.be_output("")
        try:
            result = asyncio.run(self.get_web(ans))
            return Crawler.be_output(result)
        except Exception as e:
            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")

    async def get_web(self, url):
        proxy = self._param.proxy if self._param.proxy else None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(
                url=url,
                bypass_cache=True
            )

            # Return the crawled page in the format selected in the component form.
            match self._param.extract_type:
                case 'html':
                    return result.cleaned_html
                case 'markdown':
                    return result.markdown
                case 'content':
                    return result.extracted_content
                case _:
                    return result.markdown
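
A hypothetical smoke test for the component, assuming `proxy` and `extract_type` are set directly on `CrawlerParam` (in a real flow they are populated from the agent DSL) and that the `Crawler` constructor takes `(canvas, id, param)` like other components; adjust to the actual `ComponentBase` signature if it differs:

```python
# Hypothetical smoke test; the constructor arguments and direct attribute
# assignments are assumptions, not part of the committed API surface.
import asyncio

from agent.component.crawler import Crawler, CrawlerParam

param = CrawlerParam()
param.proxy = None                # e.g. "http://127.0.0.1:8888"
param.extract_type = "markdown"   # one of: "html", "markdown", "content"

crawler = Crawler(None, "crawler_0", param)  # assumed (canvas, id, param) signature
print(asyncio.run(crawler.get_web("https://example.com")))
```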
1 change: 1 addition & 0 deletions web/src/assets/svg/crawler.svg
10 changes: 10 additions & 0 deletions web/src/locales/en.ts
@@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
yahooFinance: 'YahooFinance',
yahooFinanceDescription:
'The component queries information about the company based on the provided ticker symbol.',
crawler: 'Web Crawler',
crawlerDescription:
'This component can be used to crawl HTML source code from a specified URL.',
proxy: 'Proxy',
crawlerResultOptions: {
html: 'Html',
markdown: 'Markdown',
content: 'Content',
},
extractType: 'Extract Type',
info: 'Info',
history: 'History',
financials: 'Financials',
9 changes: 9 additions & 0 deletions web/src/locales/zh-traditional.ts
@@ -877,6 +877,15 @@ export default {
akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
yahooFinance: '雅虎財經',
yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
crawler: '網頁爬蟲',
crawlerDescription: '該組件可用於從指定url爬取HTML源碼。',
proxy: '代理',
crawlerResultOptions: {
html: 'Html',
markdown: 'Markdown',
content: '文本',
},
extractType: '提取類型',
info: '訊息',
history: '歷史',
financials: '財務',
9 changes: 9 additions & 0 deletions web/src/locales/zh.ts
@@ -897,6 +897,15 @@ export default {
akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
yahooFinance: '雅虎财经',
yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
crawler: '网页爬虫',
crawlerDescription: '该组件可用于从指定url爬取html源码。',
proxy: '代理',
crawlerResultOptions: {
html: 'Html',
markdown: 'Markdown',
content: '文本',
},
extractType: '提取类型',
info: '信息',
history: '历史',
financials: '财务',
12 changes: 12 additions & 0 deletions web/src/pages/flow/constant.tsx
@@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
@@ -73,6 +74,7 @@
Concentrator = 'Concentrator',
TuShare = 'TuShare',
Note = 'Note',
Crawler = 'Crawler',
}

export const CommonOperatorList = Object.values(Operator).filter(
@@ -110,6 +112,7 @@ export const operatorIconMap = {
[Operator.Concentrator]: ConcentratorIcon,
[Operator.TuShare]: TuShareIcon,
[Operator.Note]: NoteIcon,
[Operator.Crawler]: CrawlerIcon,
};

export const operatorMap: Record<
@@ -233,6 +236,9 @@ export const operatorMap: Record<
},
[Operator.TuShare]: { backgroundColor: '#f8cfa0' },
[Operator.Note]: { backgroundColor: '#f8cfa0' },
[Operator.Crawler]: {
backgroundColor: '#dee0e2',
},
};

export const componentMenuList = [
@@ -323,6 +329,9 @@ export const componentMenuList = [
{
name: Operator.TuShare,
},
{
name: Operator.Crawler,
},
];

export const initialRetrievalValues = {
@@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
[Operator.Jin10]: [Operator.Begin],
[Operator.Concentrator]: [Operator.Begin],
[Operator.TuShare]: [Operator.Begin],
[Operator.Crawler]: [Operator.Begin],
};

export const NodeMap = {
@@ -605,6 +615,7 @@ export const NodeMap = {
[Operator.Jin10]: 'ragNode',
[Operator.TuShare]: 'ragNode',
[Operator.Note]: 'noteNode',
[Operator.Crawler]: 'ragNode',
};

export const LanguageOptions = [
@@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
'fenghuang',
'jinrongjie',
];
export const CrawlerResultOptions = ['markdown', 'html', 'content'];
2 changes: 2 additions & 0 deletions web/src/pages/flow/flow-drawer/index.tsx
@@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
import BeginForm from '../form/begin-form';
import BingForm from '../form/bing-form';
import CategorizeForm from '../form/categorize-form';
import CrawlerForm from '../form/crawler-form';
import DeepLForm from '../form/deepl-form';
import DuckDuckGoForm from '../form/duckduckgo-form';
import ExeSQLForm from '../form/exesql-form';
@@ -70,6 +71,7 @@ const FormMap = {
[Operator.YahooFinance]: YahooFinanceForm,
[Operator.Jin10]: Jin10Form,
[Operator.TuShare]: TuShareForm,
[Operator.Crawler]: CrawlerForm,
};

const EmptyContent = () => <div>empty</div>;
37 changes: 37 additions & 0 deletions web/src/pages/flow/form/crawler-form/index.tsx
@@ -0,0 +1,37 @@
import { useTranslate } from '@/hooks/common-hooks';
import { Form, Input, Select } from 'antd';
import { useMemo } from 'react';
import { CrawlerResultOptions } from '../../constant';
import { IOperatorForm } from '../../interface';
const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
const { t } = useTranslate('flow');
const crawlerResultOptions = useMemo(() => {
return CrawlerResultOptions.map((x) => ({
value: x,
label: t(`crawlerResultOptions.${x}`),
}));
}, [t]);
return (
<Form
name="basic"
labelCol={{ span: 6 }}
wrapperCol={{ span: 18 }}
autoComplete="off"
form={form}
onValuesChange={onValuesChange}
>
<Form.Item label={t('proxy')} name={'proxy'}>
<Input placeholder="e.g. http://127.0.0.1:8888"></Input>
</Form.Item>
<Form.Item
label={t('extractType')}
name={'extract_type'}
initialValue="markdown"
>
<Select options={crawlerResultOptions}></Select>
</Form.Item>
</Form>
);
};

export default CrawlerForm;
