Chrome extension initial commit #112


Open

wants to merge 58 commits into base: main

Commits (58)
fe645ef
Project proposal submission
christianopperman Oct 27, 2023
6cc7539
Scrape tool created. Subtitles scraped.
a598504775 Nov 17, 2023
a4d325e
Chrome extension - initial commit
himangshu81 Nov 17, 2023
c91948c
Update the subtitle dataset and scrape code
a598504775 Nov 18, 2023
632ce68
Merge branch 'main' of https://github.com/christianopperman/CS410_Fal…
a598504775 Nov 18, 2023
ce06271
Add progress report to Github
christianopperman Nov 19, 2023
ef96643
Indexing data to Elasticsearch
himangshu81 Nov 24, 2023
0b18519
Adding basic search logic
himangshu81 Nov 29, 2023
05764b0
Working plugin
himangshu81 Dec 1, 2023
6b0c737
Update README documentation
christianopperman Dec 3, 2023
7619d23
Generalize scraping script
christianopperman Dec 3, 2023
1463476
Update gitignore, include documentation images
christianopperman Dec 3, 2023
ec86dc0
Increase sleep waittime
christianopperman Dec 3, 2023
c852c1c
Update .gitignore
christianopperman Dec 5, 2023
7e1881f
Refine scraper script to have a single point of entry
christianopperman Dec 5, 2023
1c00933
Add requirements.txt file for Python scraper
christianopperman Dec 5, 2023
fd17060
Update ElasticSearch writer
christianopperman Dec 5, 2023
2555ae6
Rename Python script parent folder
christianopperman Dec 5, 2023
67e773f
Move project proposal and progress report to Documentation folder
christianopperman Dec 5, 2023
7fc19bd
Remove log and credentials from GitHub
christianopperman Dec 5, 2023
ad413d5
Rename Extension folder
christianopperman Dec 5, 2023
af2406a
Update ES to index Week, lecture title and subtitles and including th…
himangshu81 Dec 7, 2023
470f9fc
Adding the manifest file back for chrome extension
himangshu81 Dec 7, 2023
3a295f3
Removing .idea
himangshu81 Dec 7, 2023
3b73924
Build UI
a598504775 Dec 8, 2023
0a1f5de
chat coursera first commit. Does not run yet.
aadi123 Dec 8, 2023
c6934f3
Fix course title scraping
christianopperman Dec 10, 2023
90b0244
Update README documentation. Add course title scraping to CourseraScr…
christianopperman Dec 10, 2023
f48293a
Updating ES URL and password.
himangshu81 Dec 11, 2023
b77e7be
Cleaning up to "subtitles" index.
himangshu81 Dec 11, 2023
21d8a52
added chat coursera working
aadi123 Dec 12, 2023
97c667c
Adding course_name in search result and pushing to ES
himangshu81 Dec 12, 2023
3901e93
Update gitignore
christianopperman Dec 12, 2023
5910720
Update course scraper and elastic search push to fix bugs
christianopperman Dec 12, 2023
5b70377
Resolve merge conflicts
christianopperman Dec 12, 2023
4774304
Merge pull request #1 from christianopperman/course_name
christianopperman Dec 12, 2023
f737268
Refactor ChatGPT integration code
christianopperman Dec 12, 2023
a6a610e
Update README to finalize scraper usage instructions
christianopperman Dec 12, 2023
0ca0b97
Add successful push to ElasticSearch image
christianopperman Dec 12, 2023
52a962b
Update UI to show the course name
a598504775 Dec 12, 2023
e44e5f5
UI updated to show course name--Fix link issue.
a598504775 Dec 12, 2023
79590f3
Remove GPT tab from Extension as code can't be called from it
christianopperman Dec 12, 2023
b0c2fd7
Resolve merge commits
christianopperman Dec 12, 2023
9f6ba73
Remove code related to GPT
a598504775 Dec 12, 2023
9a8a788
Update CS410_Fall2023_CourseProject_TeamCAHJ.png
a598504775 Dec 12, 2023
4451da0
Update README documentation
christianopperman Dec 13, 2023
9e1c85c
Update requirements.txt with ChatGPT Integration requirements
christianopperman Dec 13, 2023
7b04292
Remove old requirements.txt file
christianopperman Dec 13, 2023
a5714ea
Restructure ChatGPT integration for style
christianopperman Dec 13, 2023
218b082
Merge branch 'main' of https://github.com/christianopperman/CS410_Fal…
christianopperman Dec 13, 2023
4a8dc4f
Fix README images
christianopperman Dec 13, 2023
bea929e
Add Future Improvements section to README
christianopperman Dec 13, 2023
9574209
Add workflow diagram
christianopperman Dec 13, 2023
88e0153
Add Chrome Extension usage screenshots and resize images
christianopperman Dec 13, 2023
e9f9b96
Cleanup and removing comments
himangshu81 Dec 14, 2023
a6f1020
Finalize documentation and README
christianopperman Dec 15, 2023
d8c9cd0
Merge branch 'main' of https://github.com/christianopperman/CS410_Fal…
christianopperman Dec 15, 2023
2ad3460
Adding demo video to README
christianopperman Dec 15, 2023
12 changes: 12 additions & 0 deletions .gitignore
@@ -0,0 +1,12 @@
__pycache__/*
.idea
.idea/*
node_modules
*.docx
*.DS_Store
*.iml
*.log
*.csv
*.pyc
*/subtitles.json
docs/
44 changes: 44 additions & 0 deletions ChatGPTQuerier/chat_coursera.py
@@ -0,0 +1,44 @@
import openai
import os
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import JSONLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma

from dotenv import load_dotenv, find_dotenv


_ = load_dotenv(find_dotenv()) # read local .env file
loader = JSONLoader(
    file_path='./chat_subtitles.json',
    jq_schema='.filler[].text',
    text_content=False)

docs = loader.load()
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
trans_docs = r_splitter.split_documents(docs)

# print(trans_docs)

persist_directory = 'docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)
vectordb.add_documents(trans_docs)  # index the split chunks, not the unsplit documents


llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())
while True:
    question = input()
    result = qa_chain({"query": question})
    print(result["result"])
124 changes: 124 additions & 0 deletions ChatGPTQuerier/chat_subtitles.json
@@ -0,0 +1,124 @@
{
"filler": [
{
"time": "0:00",
"text": "[SOUND] Hello. Welcome to the course Text Mining and Analytics. My name is ChengXiang Zhai. I have a nickname, Cheng. I am a professor of the Department of Computer Science at the University of Illinois at Urbana-Champaign. This course is a part of a data mining specialization offered by the University of Illinois at Urbana-Champaign. In addition to this course, there are four other courses offered by",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "0:39",
"text": "Professor Jiawei Han, Professor John Hart and me, followed by a capstone project course that all of us will teach together.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "0:51",
"text": "This course is particularly related to another course in the specialization, mainly text retrieval and search engines in that both courses are about text data.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "1:07",
"text": "In contrast, pattern discovery and cluster analysis are about algorithms more applicable to all kinds of data in general. The visualization course is also relatively general in that the techniques can be applied to all kinds of data.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "1:28",
"text": "This course addresses a pressing need for harnessing big text data.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "1:35",
"text": "Text data has been growing dramatically recently, mostly because of the advance of technologies deployed on the web that would enable people to quickly generate text data.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "1:50",
"text": "So, I listed some of the examples on this slide",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "1:57",
"text": "that can show a variety of text data that are available today. For example, if you think about the data on the internet, on the web,",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "2:07",
"text": "everyday we are seeing many web pages being created.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "2:13",
"text": "Blogs are another kind of new text data that are being generated quickly by people. Anyone can write a blog article on the web. New articles of course have always been a main kind of text data that being generated everyday.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "2:31",
"text": "Emails are yet another kind of text data. And literature is also representing a large portion of text data. It's also especially very important because of the high quality in the data. That is, we encode our knowledge about the word using text data represented by all the literature articles. It's a vast amount of knowledge of",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "3:08",
"text": "all the text and data in these literature articles.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "3:14",
"text": "Twitter is another representative text data representing social media. Of course there are forums as well.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "3:24",
"text": "People are generating tweets very quickly indeed as we are speaking perhaps many people have already written many tweets. So, as you can see there are all kinds of text data that are being generated very quickly.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "3:38",
"text": "Now these text data present some challenges for people.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "3:43",
"text": "It's very hard for anyone to digest all the text data quickly. In particular, it's impossible for scientists to read all of the for example or for anyone to read all the tweets.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "4:01",
"text": "So there's a need for tools to help people digest text data more efficiently.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "4:09",
"text": "There is also another interesting opportunity provided by such big text data, and that is it's possible to leverage the amount of text data to discover interesting patterns to turn text data into actionable knowledge that can be useful for decision making.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "4:27",
"text": "So for example, product managers may be interested in knowing the feedback of customers about their products, knowing how well their products are being received as compared with the products of competitors. This can be a good opportunity for leveraging text data as we have seen a lot of reviews of product on the web. So if we can develop a master text mining techniques to tap into such a [INAUDIBLE] to extract the knowledge and opinions of people about these products, then we can help these product managers to gain business intelligence or to essentially feedback from their customers.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "5:18",
"text": "In scientific research, for example, scientists are interested in knowing the trends of research topics, knowing",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "5:29",
"text": "about what related fields have discovered. This problem is especially important in biology research as well. Different communities tend to use different terminologies, yet they're starting very similar problems. So how can we integrate the knowledge that is covered in different communities to help study a particular problem? It's very important, and it can speed up scientific discovery.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "5:57",
"text": "So there are many such examples where we can leverage the text data to discover useable knowledge to optimize our decision.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "6:06",
"text": "The main techniques for harnessing big text data are text retrieval and text mining. So these are two very much related technologies.Yet, they have somewhat different purposes. These two kinds of techniques are covered in the tool in this specialization. So, text retrieval on search engines covers text retrieval, and this is necessary to turn big text data into a much smaller but more relevant text data, which are often the data that we need to handle a particular problem or to optimize a particular decision. This course covers text mining which is a second step in this pipeline that can be used to further process the small amount of relevant data to extract the knowledge or to help people digest the text data easily. So the two courses are clearly related, in fact, some of the techniques are shared by both text retrieval and text mining. If you have already taken the text retrieval course, then you might see some of the content being repeated in this text mining course, although we'll be talking about the techniques from a very different perspective. If you have not taken the text retrieval course, it's also fine because this course is self-contained and you can certainly understand all of the materials without a problem. Of course, you might find it beneficial to take both courses and that will give you a very complete set of skills to handle big text data.",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
},
{
"time": "8:02",
"text": "[MUSIC]",
"url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics"
}
]
}
24 changes: 24 additions & 0 deletions ChromeExtension/index.html
@@ -0,0 +1,24 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <link rel="stylesheet" href="style.css" />
    <title>Search Coursera Lectures</title>
</head>
<body>
    <div class="extension__container">
        <header class="header__course"><h1>Coursera Transcript Search</h1></header>
        <div class="result__container--transcript" id="result-container-transcript"></div>
        <footer class="footer__input">
            <input type="text" id="searchbox" placeholder="Search text...">
            <button id="submit-button">Submit</button>
        </footer>
    </div>
    <script src="js/search.js"></script>
</body>
</html>



147 changes: 147 additions & 0 deletions ChromeExtension/js/search.js
@@ -0,0 +1,147 @@
const search_btn = document.getElementById("submit-button");
const result_container = document.querySelector('#result-container-transcript')

search_btn.addEventListener('click', function () {
    if (result_container.childElementCount > 0) {
        remove_all_children(result_container)
    }

    search_api()
});

async function search_api() {

    var headers = new Headers();
    headers.append("Content-Type", "application/json");
    headers.append("Authorization", "Basic ZWxhc3RpYzpwY2lXY2xwTE5kWHVpY1VoWFY4YmhnazI=");

    const query_txt = document.getElementById("searchbox").value
    // Query string to send to Elasticsearch
    const query_payload = {
        size: 5,
        from: 0,
        query: {
            "query_string": {
                "query": query_txt
            }
        }
    }
    var requestOptions = {
        method: 'POST',
        headers: headers,
        body: JSON.stringify(query_payload)
    };

    // Call the ES _search API to retrieve results from the "subtitles" index
    const response = await fetch("https://ac55987c83844faa90726d4e5efe92b9.us-central1.gcp.cloud.es.io/subtitles/_search", requestOptions)
    const record = await response.json()
    if (record.hits.total.value > 0) {
        const result_num = Math.min(record.hits.total.value, 5)
        for (let i = 0; i < result_num; i++) {
            const result = record.hits.hits[i]._source
            const result_dict = {}
            const response_str = '<strong>' + result.week + ' </br> </strong>'
                + '<strong> Title :: </strong>' + result.lecture_title + '</br>' +
                '<a href="' + result.url + '"> timestamp </a>:: ' + result.time + '<br/>'
                + '<strong> Subtitles </strong> : ' + result.text
                + '</br>'
            console.log("Response :: ", response_str)
            result_dict["week"] = "Week " + result.week.slice(-1)
            result_dict["lecture_title"] = result.lecture_title
            result_dict["url"] = result.url
            result_dict["time"] = result.time
            result_dict["subtitles"] = result.text
            result_dict["course_name"] = result.course_name
            set_result_format(result_dict)
        }
    } else {
        const result_div = document.createElement('div')
        result_div.innerHTML = "We could not find a related topic"
        result_container.appendChild(result_div)
    }

}

function set_result_format(result_dict) {

    // Initiate html components
    const result_item = document.createElement('div')
    const result_first_row = document.createElement('div')
    const result_second_row = document.createElement('div')
    const result_url = document.createElement('a')
    const result_week = document.createElement('h4')
    const result_course_name = document.createElement('h4')
    const result_time = document.createElement('h4')
    const result_lecture_title = document.createElement('h4')
    const result_subtitles = document.createElement('p')

    // Set up class/id for some components
    result_item.classList.add("result__item")
    result_first_row.classList.add("result__first--row")
    result_second_row.classList.add("result__second--row")
    result_course_name.classList.add("result__course--name")
    result_time.classList.add("timestamp")
    result_url.classList.add("lecture__url")

    // Set the content of components
    result_url.href = result_dict["url"]
    result_week.innerHTML = result_dict["week"]
    result_course_name.innerHTML = result_dict["course_name"]
    const time_reformat = format_time(result_dict["time"])
    result_time.innerHTML = time_reformat
    result_lecture_title.innerHTML = result_dict["lecture_title"]
    result_subtitles.innerHTML = result_dict["subtitles"]

    // Organize html component structure
    result_item.appendChild(result_url)
    result_item.appendChild(result_first_row)
    result_first_row.append(result_week)
    result_first_row.append(result_course_name)
    result_item.appendChild(result_second_row)
    result_second_row.appendChild(result_time)
    result_second_row.appendChild(result_lecture_title)
    result_item.appendChild(result_subtitles)

    result_container.appendChild(result_item)
}

function format_time(time) {
    // Reverse the components so index 0 is always seconds, index 1 minutes,
    // and index 2 (if present) hours; timestamps arrive as "M:SS" or "H:MM:SS"
    let parts = time.split(':').map(part => parseInt(part, 10)).reverse();
    let seconds = parts[0];
    let minutes = parts.length > 1 ? parts[1] : 0;
    let hours = parts.length > 2 ? parts[2] : 0;

    // Make sure each part has two digits
    hours = hours.toString().padStart(2, '0');
    minutes = minutes.toString().padStart(2, '0');
    seconds = seconds.toString().padStart(2, '0');

    return `${hours}:${minutes}:${seconds}`;
}

function remove_all_children(element) {
    while (element.firstChild) {
        element.removeChild(element.firstChild);
    }
}

document.addEventListener('DOMContentLoaded', function () {
    const parent = document.querySelector('.result__container--transcript');

    parent.addEventListener('click', function (event) {
        // Check if the clicked element or one of its ancestors is a result item
        let container = event.target.classList.contains('result__item')
            ? event.target
            : event.target.closest('.result__item');

        if (container) {
            // Extract the URL from the child anchor tag
            let url = container.querySelector('.lecture__url').getAttribute('href');

            // Open the URL in a new tab
            if (url) {
                chrome.tabs.create({ url: url });
            }
        }
    });
});
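For reference, the timestamp normalization performed by `format_time` above can be sketched in Python (a minimal illustration of the same logic, not part of the extension code):

```python
def format_time(time_str: str) -> str:
    """Normalize a 'M:SS' or 'H:MM:SS' timestamp to zero-padded 'HH:MM:SS'."""
    # Reverse so index 0 is always seconds, index 1 minutes, index 2 hours.
    parts = [int(p) for p in time_str.split(":")][::-1]
    seconds = parts[0]
    minutes = parts[1] if len(parts) > 1 else 0
    hours = parts[2] if len(parts) > 2 else 0
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

print(format_time("0:39"))     # 00:00:39
print(format_time("1:02:03"))  # 01:02:03
```

Reversing the split components makes short timestamps like those in `chat_subtitles.json` (e.g. "0:39") parse correctly as minutes and seconds rather than hours and minutes.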
16 changes: 16 additions & 0 deletions ChromeExtension/manifest.json
@@ -0,0 +1,16 @@
{
    "name": "CS410_Fall2023_CourseProject_TeamCAHJ",
    "description": "Base Level Extension",
    "version": "1.0",
    "permissions": [
        "storage",
        "tabs"
    ],
    "host_permissions": ["http://*/*", "https://*/*"],
    "manifest_version": 3,
    "action": {
        "default_popup": "index.html",
        "default_icon": "img/CS410_Fall2023_CourseProject_TeamCAHJ.png",
        "default_title": "CS410_Fall2023_CourseProject_TeamCAHJ"
    }
}