Skip to content

Wikipedia URL input #424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 24 additions & 27 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,34 +129,31 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
success_count=0
failed_count=0
lst_file_name=[]
queries_list = wiki_query.split(',')
wiki_query_ids, languages = check_url_source(source_type=source_type, queries_list=queries_list)
for query,language in zip(wiki_query_ids, languages):
logging.info(f"Creating source node for {query.strip()}, {language}")
pages = WikipediaLoader(query=query.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load()
try:
if not pages:
failed_count+=1
continue
obj_source_node = sourceNode()
obj_source_node.file_name = query.strip()
obj_source_node.file_type = 'text'
obj_source_node.file_source = source_type
obj_source_node.file_size = sys.getsizeof(pages[0].page_content)
obj_source_node.total_pages = len(pages)
obj_source_node.model = model
obj_source_node.url = urllib.parse.unquote(pages[0].metadata['source'])
obj_source_node.created_at = datetime.now()
obj_source_node.language = language
graphDb_data_Access = graphDBdataAccess(graph)
graphDb_data_Access.create_source_node(obj_source_node)
success_count+=1
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Success'})
except Exception as e:
failed_count+=1
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Failed'})
#queries_list = wiki_query.split(',')
wiki_query_id, language = check_url_source(source_type=source_type, wiki_query=wiki_query)
logging.info(f"Creating source node for {wiki_query_id.strip()}, {language}")
pages = WikipediaLoader(query=wiki_query_id.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load()
if pages==None or len(pages)==0:
failed_count+=1
message = f"Unable to read data for given Wikipedia url : {wiki_query}"
raise Exception(message)
else:
obj_source_node = sourceNode()
obj_source_node.file_name = wiki_query_id.strip()
obj_source_node.file_type = 'text'
obj_source_node.file_source = source_type
obj_source_node.file_size = sys.getsizeof(pages[0].page_content)
obj_source_node.total_pages = len(pages)
obj_source_node.model = model
obj_source_node.url = urllib.parse.unquote(pages[0].metadata['source'])
obj_source_node.created_at = datetime.now()
obj_source_node.language = language
graphDb_data_Access = graphDBdataAccess(graph)
graphDb_data_Access.create_source_node(obj_source_node)
success_count+=1
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Success'})
return lst_file_name,success_count,failed_count

def extract_graph_from_file_local_file(graph, model, fileName, merged_file_path, allowedNodes, allowedRelationship):

logging.info(f'Process file name :{fileName}')
Expand Down
30 changes: 15 additions & 15 deletions backend/src/shared/common_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,36 +19,36 @@

#watch("neo4j")

def check_url_source(source_type, yt_url:str=None, queries_list:List[str]=None):
languages=[]
def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
language=''
try:
logging.info(f"incoming URL: {yt_url}")
if source_type == 'youtube':
if re.match('(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?',yt_url.strip()):
youtube_url = create_youtube_url(yt_url.strip())
logging.info(youtube_url)
return youtube_url,languages
return youtube_url,language
else:
raise Exception('Incoming URL is not youtube URL')

elif source_type == 'Wikipedia':
wiki_query_ids=[]
wiki_query_id=''
#pattern = r"https?:\/\/([a-zA-Z0-9\.\,\_\-\/]+)\.wikipedia\.([a-zA-Z]{2,3})\/wiki\/([a-zA-Z0-9\.\,\_\-\/]+)"
wikipedia_url_regex = r'https?:\/\/(www\.)?([a-zA-Z]{2,3})\.wikipedia\.org\/wiki\/(.*)'
wiki_id_pattern = r'^[a-zA-Z0-9 _\-\.\,\:\(\)\[\]\{\}\/]*$'

for wiki_url in queries_list:
match = re.search(wikipedia_url_regex, wiki_url.strip())
if match:
languages.append(match.group(2))
wiki_query_ids.append(match.group(3))
else :
languages.append("en")
wiki_query_ids.append(wiki_url.strip())

match = re.search(wikipedia_url_regex, wiki_query.strip())
if match:
language = match.group(2)
wiki_query_id = match.group(3)
# else :
# languages.append("en")
# wiki_query_ids.append(wiki_url.strip())
else:
raise Exception(f'Not a valid wikipedia url: {wiki_query} ')

logging.info(f"wikipedia query ids = {wiki_query_ids}")
return wiki_query_ids, languages
logging.info(f"wikipedia query id = {wiki_query_id}")
return wiki_query_id, language
except Exception as e:
logging.error(f"Error in recognize URL: {e}")
raise Exception(e)
Expand Down
22 changes: 18 additions & 4 deletions frontend/src/App.css
Original file line number Diff line number Diff line change
Expand Up @@ -278,23 +278,34 @@
width: 100%;
}

@container (min-height:500px) and (max-height:600px) {
@container (min-height:500px) and (max-height:700px) {
.outline-dashed img {
width: 45px;
width: 40px;
height: auto;
}

.outline-dashed h6 {
font-size: 14px;
}

}

@container (min-height:300px) and (max-height:500px) {
@container (min-height:400px) and (max-height:500px) {
.outline-dashed img {
width: 35px;
height: auto;
}

.outline-dashed h6 {
font-size: 14px;
}

}
@container (max-height:300px) {
.outline-dashed img {
width: 30px;
height: auto;
}

.outline-dashed h6 {
font-size: 12px;
}
Expand All @@ -308,4 +319,7 @@
.imageBg >div{
padding: 5px;
}
}
.ndl-dropzone .ndl-dropzone-header{
margin-bottom: 0 !important;
}
4 changes: 2 additions & 2 deletions frontend/src/components/DropZone.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ const DropZone: FunctionComponent = () => {
return {
...curfile,
status: 'Failed',
type: curfile.type?.split('/')[1]?.toUpperCase() ?? 'PDF',
type: `${file.name.substring(file.name.lastIndexOf('.') + 1, file.name.length).toUpperCase()}`,
};
}
return curfile;
Expand Down Expand Up @@ -228,7 +228,7 @@ const DropZone: FunctionComponent = () => {
className='!bg-none dropzoneContainer'
supportedFilesDescription={
<Typography variant='body-small'>
<Flex>
<Flex gap='0'>
<span>Documents, Images, Unstructured</span>
<div className='align-self-center'>
<IconButtonWithToolTip
Expand Down
38 changes: 27 additions & 11 deletions frontend/src/components/WikipediaModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,25 @@ import { useFileContext } from '../context/UsersFiles';
import { v4 as uuidv4 } from 'uuid';
import { useCredentials } from '../context/UserCredentials';
import { urlScanAPI } from '../services/URLScan';
import { wikiValidation } from '../utils/Utils';

const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
const [wikiQuery, setwikiQuery] = useState<string>('');
const [statusMessage, setStatusMessage] = useState<string>('');
const [status, setStatus] = useState<'unknown' | 'success' | 'info' | 'warning' | 'danger'>('unknown');
const { setFilesData, model, filesData } = useFileContext();
const { userCredentials } = useCredentials();
const [isFocused, setisFocused] = useState<boolean>(false);
const [isValid, setValid] = useState<boolean>(false);
const onClose = useCallback(() => {
hideModal();
setwikiQuery('');
setStatus('unknown');
setValid(false)
setisFocused(false)
}, []);

const submitHandler = async () => {
const submitHandler = async (url: string) => {
const defaultValues: CustomFileBase = {
processing: 0,
status: 'New',
Expand All @@ -30,7 +35,10 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
fileSource: 'Wikipedia',
processingProgress: undefined,
};
if (wikiQuery.length) {
if (url.trim() != '') {
setValid(wikiValidation(url) && isFocused);
}
if (isValid) {
try {
setStatus('info');
setStatusMessage('Scanning...');
Expand All @@ -47,6 +55,8 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
setTimeout(() => {
setStatus('unknown');
setwikiQuery('');
setValid(false)
setisFocused(false)
hideModal();
}, 5000);
return;
Expand All @@ -56,13 +66,13 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
if (apiResCheck) {
setStatus('info');
setStatusMessage(
`Successfully Created Source Nodes for ${apiResponse.data.success_count} and Failed for ${apiResponse.data.failed_count} Wikipedia Sources`
`Successfully Created Source Node for ${apiResponse.data.success_count} and Failed for ${apiResponse.data.failed_count} Wikipedia Link`
);
} else if (apiResponse?.data?.success_count) {
setStatusMessage(`Successfully Created Source Nodes for ${apiResponse.data.success_count} Wikipedia Sources`);
setStatusMessage(`Successfully Created Source Node for ${apiResponse.data.success_count} Wikipedia Link`);
} else {
setStatus('danger');
setStatusMessage(`Failed to Create Source Nodes for ${apiResponse.data.failed_count} Wikipedia Sources`);
setStatusMessage(`Failed to Create Source Node for ${apiResponse.data.failed_count} Wikipedia Link`);
}

const copiedFilesData: CustomFile[] = [...filesData];
Expand Down Expand Up @@ -96,13 +106,15 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
});
setFilesData(copiedFilesData);
setwikiQuery('');
setValid(false);
setisFocused(false);
} catch (error) {
setStatus('danger');
setStatusMessage('Some Error Occurred or Please Check your Instance Connection');
}
} else {
setStatus('danger');
setStatusMessage('Please Fill the Wikipedia source');
setStatusMessage('Please Fill the Wikipedia Link');
setTimeout(() => {
setStatus('unknown');
}, 5000);
Expand All @@ -111,30 +123,34 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
setTimeout(() => {
setStatus('unknown');
hideModal();
}, 500);
}, 1000);
};
return (
<CustomModal
open={open}
onClose={onClose}
statusMessage={statusMessage}
setStatus={setStatus}
submitHandler={submitHandler}
submitHandler={() => submitHandler(wikiQuery)}
status={status}
submitLabel='Submit'
>
<div className='w-full inline-block'>
<TextInput
type='url'
id='keyword'
value={wikiQuery}
disabled={false}
label='Wikipedia Keywords'
aria-label='Wikipedia Keywords'
placeholder='Albert Einstein ,Isaac Newton'
label='Wikipedia Link'
aria-label='Wikipedia Link'
placeholder='https://en.wikipedia.org/wiki/Albert_Einstein'
autoFocus
fluid
required
onBlur={() => setValid(wikiValidation(wikiQuery) && isFocused)}
errorText={!isValid && isFocused && 'Please Fill The Valid URL'}
onChange={(e) => {
setisFocused(true);
setwikiQuery(e.target.value);
}}
/>
Expand Down
3 changes: 3 additions & 0 deletions frontend/src/utils/Utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ export const validation = (url: string) => {
return url.trim() != '' && /^s3:\/\/([^/]+)\/?$/.test(url) != false;
};

// Validates a Wikipedia article URL. Mirrors the backend check_url_source()
// pattern: http or https, optional "www.", 2-3 letter language subdomain,
// and a non-empty article id after /wiki/. The previous version rejected
// http:// and www. URLs that the backend accepts, and carried a stateful
// `g` flag that is a known footgun with RegExp.prototype.test().
export const wikiValidation = (url: string) => {
  return url.trim() !== '' && /https?:\/\/(www\.)?([a-zA-Z]{2,3})\.wikipedia\.org\/wiki\/(.+)/.test(url);
};
// Status indicator icons to status column
export const statusCheck = (status: string) => {
switch (status) {
Expand Down