# HW1 | Notion

from elasticsearch import Elasticsearch
import requests
import tiktoken

# Connect to the local Elasticsearch node.
# The address must be a bare "scheme://host:port" string; the angle brackets
# that previously wrapped it were markdown-autolink residue and made the URL
# invalid.
es = Elasticsearch("http://localhost:9200", verify_certs=False)

# Download the FAQ documents as JSON.
# The URL is a plain string without surrounding angle brackets (those were
# markdown-autolink residue and prevented the request from being sent).
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the script from hanging indefinitely on a stalled connection.
resp = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
resp.raise_for_status()
documents_raw = resp.json()

# Flatten the per-course structure into one list of FAQ entries.
# Each entry is tagged in place with the name of the course it came from.
documents = []
for course_entry in documents_raw:
    course_label = course_entry['course']
    entries = course_entry['documents']
    for entry in entries:
        entry['course'] = course_label
    documents.extend(entries)

# Start from a clean slate: drop the index if a previous run left one behind.
index_name = "faq"
stale_index_present = es.indices.exists(index=index_name)
if stale_index_present:
    es.indices.delete(index=index_name)

# Create the index with an explicit mapping: `course` is a keyword field,
# while `question` and `text` are full-text fields.
faq_properties = {
    "course": {"type": "keyword"},
    "question": {"type": "text"},
    "text": {"type": "text"},
}
es.indices.create(index=index_name, mappings={"properties": faq_properties})

# Index every FAQ entry, using its position in `documents` as the document id.
for i, doc in enumerate(documents):
    es.index(index=index_name, id=i, document=doc)

# Force a refresh so that everything just indexed is visible to search.
# Without it, a search issued right after indexing can miss the most recently
# written documents, making the scores and hits below nondeterministic.
es.indices.refresh(index=index_name)

# Q3: search the whole index and report the score of the best match.
# Both `question` and `text` are searched; `question` carries a ^4 boost.
q3_multi_match = {
    "query": "How do execute a command on a Kubernetes pod?",
    "fields": ["question^4", "text"],
    "type": "best_fields",
}
q3_query = {"query": {"multi_match": q3_multi_match}, "size": 1}
# Only the "query" part of the dict is sent; the result size is passed
# separately as a keyword argument.
q3_res = es.search(index=index_name, query=q3_query["query"], size=1)
q3_top_hit = q3_res['hits']['hits'][0]
q3_score = q3_top_hit['_score']
print("Q3 答案：最高分數是", round(q3_score, 2))

# Q4: run a multi_match search restricted to a single course and list the
# questions of the top three hits.
q4_match_clause = {
    "multi_match": {
        "query": "How do copy a file to a Docker container?",
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
}
q4_course_filter = {"term": {"course": "machine-learning-zoomcamp"}}
q4_query = {
    "query": {
        "bool": {
            "must": q4_match_clause,
            "filter": q4_course_filter,
        }
    },
    "size": 3,
}
# Only the "query" part of the dict is sent; the result size is passed
# separately as a keyword argument.
q4_res = es.search(index=index_name, query=q4_query["query"], size=3)

print("Q4 答案：前三筆問題分別是：")
for rank, match in enumerate(q4_res['hits']['hits'], start=1):
    source = match['_source']
    print(f"{rank}. {source['question']}")

# Q5: build the LLM prompt from the Q4 search hits and report its length.
context_template = """
Q: {question}
A: {text}
""".strip()

# One formatted "Q: ... / A: ..." entry per search hit.
context_entries = [
    context_template.format(
        question=hit['_source']['question'],
        text=hit['_source']['text'],
    )
    for hit in q4_res['hits']['hits']
]

# Separate entries with a real blank line. The separator was previously
# "\\n\\n", which inserted the four literal characters backslash-n-backslash-n
# instead of two newlines and so distorted the prompt length and token count.
context = "\n\n".join(context_entries)

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

final_prompt = prompt_template.format(
    question="How do copy a file to a Docker container?",
    context=context,
)
print("Q5 答案：prompt 長度是", len(final_prompt))

# Q6: count the tokens in the prompt using the encoding tiktoken selects
# for the "gpt-4o" model.
encoding = tiktoken.encoding_for_model("gpt-4o")
tokens = encoding.encode(final_prompt)
token_count = len(tokens)
print("Q6 答案：token 數量是", token_count)