Move load save example from the bug bash to the main branch. (#3360)
# Description

Add the notebook demonstrating loading and saving of evaluators to the
main branch, and add a demonstration of the `get` and `list` methods.
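The save / get / list workflow the notebook walks through can be sketched with a minimal in-memory registry. Note this is purely illustrative: `EvaluatorRegistry` and its method names are hypothetical stand-ins chosen for this sketch, not the actual promptflow API.

```python
# Hypothetical sketch of the save / get / list pattern; the registry class
# here is NOT the promptflow API, just an illustration of the workflow.

def answer_len(answer):
    """A trivial evaluator, mirroring the answer.py sample in this PR."""
    return len(answer)


class EvaluatorRegistry:
    """Hypothetical in-memory store keyed by evaluator name."""

    def __init__(self):
        self._store = {}

    def save(self, name, evaluator):
        self._store[name] = evaluator

    def get(self, name):
        return self._store[name]

    def list(self):
        return sorted(self._store)


registry = EvaluatorRegistry()
registry.save("answer_len", answer_len)
print(registry.list())                      # ['answer_len']
print(registry.get("answer_len")("Paris"))  # 5
```

The real API surface is demonstrated in the notebook added by this PR; the sketch only conveys the shape of the workflow.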

# All Promptflow Contribution checklist:
- [x] **The pull request does not introduce [breaking changes].**
- [x] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [x] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [x] Title of the pull request is clear and informative.
- [x] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [x] Pull request includes test coverage for the included changes.
nick863 authored May 30, 2024
1 parent 12c87db commit e9de990
Showing 18 changed files with 1,476 additions and 0 deletions.
479 changes: 479 additions & 0 deletions src/promptflow-evals/samples/LoadSaveEvals/Load_saved_evaluator.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/promptflow-evals/samples/LoadSaveEvals/answer.py
@@ -0,0 +1,2 @@
def answer_len(answer):
    return len(answer)
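As a quick local sanity check, the evaluator above can be applied to rows shaped like the sample eval data in this PR:

```python
# Apply the answer_len evaluator to dataset-style rows; no promptflow
# machinery is needed since the evaluator is a plain function.
def answer_len(answer):
    return len(answer)

rows = [
    {"question": "what's the capital of China?", "answer": "Shanghai"},
    {"question": "Is France in Europe?", "answer": "Yes"},
]

results = [answer_len(row["answer"]) for row in rows]
print(results)  # [8, 3]
```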
@@ -0,0 +1,37 @@
---
name: basic evaluate
description: basic evaluator for QA scenario
model:
  api: chat
  configuration:
    type: azure_openai
    azure_deployment: gpt-35-turbo-1106
    api_key: ${env:AZURE_OPENAI_API_KEY}
    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
  parameters:
    temperature: 0.2
    max_tokens: 200
    top_p: 1.0
    response_format:
      type: json_object

inputs:
  answer:
    type: string
outputs:
  score:
    type: string
---
system:
You are an AI assistant.
Your task is to determine whether the answer contains an apology. If the answer contains an apology, return 1; otherwise return 0.
The output should be valid JSON.

**Example**
answer: "Sorry, I can only truth questions related to outdoor/camping gear and equipment"
output:
{"score": "1"}

user:
answer: {{answer}}
output:
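The prompty above asks the model to reply with a JSON object like `{"score": "1"}`. A minimal sketch of turning such a raw reply into a numeric score (assuming the reply string is already in hand; `parse_score` is an illustrative helper, not part of the sample):

```python
import json

def parse_score(raw_reply: str) -> int:
    """Parse the evaluator's JSON reply; fall back to 0 if malformed."""
    try:
        return int(json.loads(raw_reply)["score"])
    except (json.JSONDecodeError, KeyError, ValueError):
        return 0

print(parse_score('{"score": "1"}'))  # 1
print(parse_score('not json'))        # 0
```

Using `response_format: {type: json_object}` in the prompty makes the valid-JSON assumption reasonable, but the fallback still guards against surprises.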
@@ -0,0 +1,4 @@
{
    "question": "what's the capital of China?",
    "answer": "Shanghai"
}
@@ -0,0 +1,7 @@
import re
from promptflow.core import tool


@tool
def apology(answer):
    return len(re.findall('(sorry)|(apology)|(apologies)', answer.lower()))
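The regex in the tool above counts apology-related words. A standalone check of that logic (promptflow's `@tool` decorator is omitted here so the snippet runs without the library):

```python
import re

def apology(answer):
    # Count occurrences of apology-related words, case-insensitively.
    return len(re.findall('(sorry)|(apology)|(apologies)', answer.lower()))

print(apology("Sorry, my apologies."))   # 2
print(apology("The capital is Paris."))  # 0
```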
@@ -0,0 +1,17 @@
$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
inputs:
  answer:
    type: string
outputs:
  answer:
    type: string
    reference: ${apology.output}
nodes:
- name: apology
  type: python
  source:
    type: code
    path: apology.py
  inputs:
    answer: ${inputs.answer}

@@ -0,0 +1,4 @@
{"question": "Which tent is the most waterproof?", "answer": "The TrailMaster X4 tent has the highest rainfly waterproof rating of the available tents, at 2000m", "context": "#TrailMaster X4 Tent, price $250,## BrandOutdoorLiving## CategoryTents## Features- Polyester material for durability- Spacious interior to accommodate multiple people- Easy setup with included instructions- Water-resistant construction to withstand light rain- Mesh panels for ventilation and insect protection- Rainfly included for added weather protection- Multiple doors for convenient entry and exit- Interior pockets for organizing small ite- Reflective guy lines for improved visibility at night- Freestanding design for easy setup and relocation- Carry bag included for convenient storage and transportatio## Technical Specs**Best Use**: Camping **Capacity**: 4-person **Season Rating**: 3-season **Setup**: Freestanding **Material**: Polyester **Waterproof**: Yes **Rainfly**: Included **Rainfly Waterproof Rating**: 2000mm", "ground_truth": "#TrailMaster X4 Tent"}
{"question": "Which camping table is the lightest?", "answer": "The BaseCamp Folding Table is the lightest of all of the other camping tables mentioned", "context": "#BaseCamp Folding Table, price $60,## BrandCampBuddy## CategoryCamping Tables## FeaturesLightweight and durable aluminum constructionFoldable design with a compact size for easy storage and transport## Technical Specifications- **Weight**: 15 lbs- **Maximum Weight Capacity**: Up to a certain weight limit (specific weight limit not provided)", "ground_truth": "I cannot say based on the information provided."}
{"question": "How much does TrailWalker Hiking Shoes cost? ", "answer": "$110", "context": "#TrailWalker Hiking Shoes, price $110## BrandTrekReady## CategoryHiking Footwear", "ground_truth": "$110"}
{"question": "Is France in Europe?", "answer": "Sorry, I can only truth questions related to outdoor/camping gear and equipment", "context": "#TrailWalker Hiking Shoes, price $110## BrandTrekReady## CategoryHiking Footwear", "ground_truth": "Yes"}
@@ -0,0 +1,2 @@
def answer_len(answer):
    return len(answer)
@@ -0,0 +1,4 @@
inputs:
  answer:
    type: object
entry: answer:answer_len
178 changes: 178 additions & 0 deletions src/promptflow-evals/samples/evaluate-target/askwiki/askwiki.py
@@ -0,0 +1,178 @@
import os
import pathlib
import random
import time
from functools import partial

import jinja2
import requests
import bs4
import re
from concurrent.futures import ThreadPoolExecutor
from openai import AzureOpenAI


session = requests.Session()

templateLoader = jinja2.FileSystemLoader(pathlib.Path(__file__).parent.resolve())
templateEnv = jinja2.Environment(loader=templateLoader)
system_message_template = templateEnv.get_template("system-message.jinja2")


def decode_str(string):
    return string.encode().decode("unicode-escape").encode("latin1").decode("utf-8")


def remove_nested_parentheses(string):
    pattern = r'\([^()]+\)'
    while re.search(pattern, string):
        string = re.sub(pattern, '', string)
    return string


def get_page_sentence(page, count: int = 10):
    # find all paragraphs
    paragraphs = page.split("\n")
    paragraphs = [p.strip() for p in paragraphs if p.strip()]

    # find all sentences
    sentences = []
    for p in paragraphs:
        sentences += p.split('. ')
    sentences = [s.strip() + '.' for s in sentences if s.strip()]
    # get first `count` number of sentences
    return ' '.join(sentences[:count])


def fetch_text_content_from_url(url: str, count: int = 10):
    # Send a request to the URL
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"
        }
        delay = random.uniform(0, 0.5)
        time.sleep(delay)
        response = session.get(url, headers=headers)
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            page_content = [p_ul.get_text().strip() for p_ul in soup.find_all("p") + soup.find_all("ul")]
            page = ""
            for content in page_content:
                if len(content.split(" ")) > 2:
                    page += decode_str(content)
                    if not content.endswith("\n"):
                        page += "\n"
            text = get_page_sentence(page, count=count)
            return (url, text)
        else:
            msg = f"Get url failed with status code {response.status_code}.\nURL: {url}\nResponse: " \
                  f"{response.text[:100]}"
            print(msg)
            return (url, "No available content")

    except Exception as e:
        print("Get url failed with error: {}".format(e))
        return (url, "No available content")


def search_result_from_url(url_list: list, count: int = 10):
    results = []
    partial_func_of_fetch_text_content_from_url = partial(fetch_text_content_from_url, count=count)
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = executor.map(partial_func_of_fetch_text_content_from_url, url_list)
        for feature in futures:
            results.append(feature)
    return results


def get_wiki_url(entity: str, count=2):
    # Send a request to the URL
    url = f"https://en.wikipedia.org/w/index.php?search={entity}"
    url_list = []
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            mw_divs = soup.find_all("div", {"class": "mw-search-result-heading"})
            if mw_divs:  # mismatch
                result_titles = [decode_str(div.get_text().strip()) for div in mw_divs]
                result_titles = [remove_nested_parentheses(result_title) for result_title in result_titles]
                # print(f"Could not find {entity}. Similar entity: {result_titles[:count]}.")
                url_list.extend([f"https://en.wikipedia.org/w/index.php?search={result_title}" for result_title in
                                 result_titles])
            else:
                page_content = [p_ul.get_text().strip() for p_ul in soup.find_all("p") + soup.find_all("ul")]
                if any("may refer to:" in p for p in page_content):
                    url_list.extend(get_wiki_url("[" + entity + "]"))
                else:
                    url_list.append(url)
        else:
            msg = f"Get url failed with status code {response.status_code}.\nURL: {url}\nResponse: " \
                  f"{response.text[:100]}"
            print(msg)
        return url_list[:count]
    except Exception as e:
        print("Get url failed with error: {}".format(e))
        return url_list


def process_search_result(search_result):
    def format(doc: dict):
        return f"Content: {doc['Content']}"

    try:
        context = []
        for url, content in search_result:
            context.append({
                "Content": content,
                # "Source": url
            })
        context_str = "\n\n".join([format(c) for c in context])
        return context_str
    except Exception as e:
        print(f"Error: {e}")
        return ""


def augmented_qa(question, context):
    system_message = system_message_template.render(contexts=context)

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": question}
    ]

    with AzureOpenAI(
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        api_version=os.environ["AZURE_OPENAI_API_VERSION"]
    ) as client:
        response = client.chat.completions.create(
            model=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
            messages=messages, temperature=0.7,
            max_tokens=800
        )

        return response.choices[0].message.content


def ask_wiki(question):
    url_list = get_wiki_url(question, count=2)
    search_result = search_result_from_url(url_list, count=10)
    context = process_search_result(search_result)
    answer = augmented_qa(question, context)

    return {
        "answer": answer,
        "context": str(context)
    }


if __name__ == "__main__":
    print(ask_wiki("Who is the president of the United States?"))
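Two of the askwiki helpers can be exercised without any network access. The snippet below restates them (condensed but behaviorally identical to the definitions above) so it runs standalone:

```python
import re

def remove_nested_parentheses(string):
    # Repeatedly strip the innermost parenthesized group until none remain.
    pattern = r'\([^()]+\)'
    while re.search(pattern, string):
        string = re.sub(pattern, '', string)
    return string

def get_page_sentence(page, count: int = 10):
    # Split into paragraphs, then sentences, and keep the first `count`.
    paragraphs = [p.strip() for p in page.split("\n") if p.strip()]
    sentences = []
    for p in paragraphs:
        sentences += p.split('. ')
    sentences = [s.strip() + '.' for s in sentences if s.strip()]
    return ' '.join(sentences[:count])

print(remove_nested_parentheses("Paris (capital (of France)) is large"))
print(get_page_sentence("One. Two. Three. Four.", count=2))  # 'One. Two.'
```

Note that `remove_nested_parentheses` leaves behind the surrounding whitespace of each removed group, and `get_page_sentence` re-appends a period to every fragment, so a trailing sentence that already ends in "." gains a second one; both quirks match the original helpers.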
@@ -0,0 +1,5 @@
You are a chatbot having a conversation with a human.
Given the following extracted parts of a long document and a question, create a final answer.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

{{contexts}}
3 changes: 3 additions & 0 deletions src/promptflow-evals/samples/evaluate-target/data/data.jsonl
@@ -0,0 +1,3 @@
{"question":"When was the United States founded?", "ground_truth":"1776"}
{"question":"What is the capital of France?", "ground_truth":"Paris"}
{"question":"Who is the best tennis player of all time ?", "ground_truth":"Roger Federer"}