Web Page Reader

Demonstrates how to load web content with our web page readers, build an index over it, and query that index.

import logging
import sys

# Route INFO-and-above log records to stdout through a single root handler.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger, so this configuration takes effect even when the environment has
# pre-installed a handler. Adding a second StreamHandler on stdout, as was done
# previously, made every record print twice whenever basicConfig had already
# installed its own stdout handler.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

Using SimpleWebPageReader

from llama_index import GPTListIndex, SimpleWebPageReader
from IPython.display import Markdown, display
import os

# NOTE: html_to_text=True only works when the html2text package is installed.
page_urls = ["http://paulgraham.com/worked.html"]
page_reader = SimpleWebPageReader(html_to_text=True)
documents = page_reader.load_data(page_urls)

# Peek at the first loaded document (a bare expression; it is shown as the
# cell output when this runs as a notebook cell).
documents[0]

# Build a list index from the loaded documents.
index = GPTListIndex.from_documents(documents)

# Switch logging to DEBUG for more detailed output.
query_engine = index.as_query_engine()
question = "What did the author do growing up?"
response = query_engine.query(question)

# Render the response in bold Markdown.
display(Markdown(f"<b>{response}</b>"))

Using TrafilaturaWebReader

from llama_index import TrafilaturaWebReader

# Load the same essay, this time through TrafilaturaWebReader.
trafilatura_reader = TrafilaturaWebReader()
documents = trafilatura_reader.load_data(
    ["http://paulgraham.com/worked.html"]
)

# Build a list index from the loaded documents.
index = GPTListIndex.from_documents(documents)

# Switch logging to DEBUG for more detailed output.
query_engine = index.as_query_engine()
response = query_engine.query(
    "What did the author do growing up?"
)

# Render the response in bold Markdown.
display(Markdown(f"<b>{response}</b>"))

Using RssReader

from llama_index import GPTListIndex, RssReader

# Load documents from the RSS feed URL(s) listed below.
feed_urls = ["https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"]
documents = RssReader().load_data(feed_urls)

# Build a list index from the loaded feed documents.
index = GPTListIndex.from_documents(documents)

# Switch logging to DEBUG for more detailed output.
query_engine = index.as_query_engine()
news_question = "What happened in the news today?"
response = query_engine.query(news_question)