Commit edfc8dc1 authored by F1nnH's avatar F1nnH
Browse files

Add and delete files, move files

parent ff3a6916
Loading
Loading
Loading
Loading
+2034 −2034

File changed and moved.

Preview size limit exceeded, changes collapsed.

+8 −22
Original line number Diff line number Diff line
# Notes

- präsentation schicken, zeitplan schicken, sachen von tafel schicken
- Zeitplan aktualisieren
- Projektmanager festlegen bzw. jemanden, der Sachen hinterfragt
- sachen vorher schicken damit wir feedback bekommen
- Zeitplan besser darstellen
- dialog skripte random generieren konzept ausarbeiten 
- dabei alternative modellieren
- präsentation schicken, zeitplan schicken
- Zeitplan aktualisieren (das was an der tafel war)
- Projektmanager festlegen bzw. jemanden festlegen, der Sachen hinterfragt (Lea)
- sachen allgemein vorher schicken damit wir feedback bekommen
- Sind unsere Slots gleich wie im Multi Datensatz, bzw müssen wir sachen anpassen, wir wollen besser die Slot values vergleichen können
- mapping zu daten aus multiwoz damit wir die selben slots füllen und besser auf die slot accuracies vergleichen können
-  


## Wochenplan 

- komplette Annotation
- Metriken weitermachen
- Prompt tuning
- Modell zum embedden aufsetzen
- Natürlichsprachliche Sätze aus den Infos machen und embedden (SBERT, weil es darauf vortrainiert ist zu bewerten, wie ähnlich sich zwei Sätze – hier Anfrage und Information – sind; einmal mit SBERT die ganze Datenbank embedden und dann mit einer Matrixmultiplikation herausfinden, was am nächsten dran ist). Dafür ist wichtig, wie Anfragen aussehen und dass sie auf dasselbe Format heruntergebrochen werden wie unsere Embeddings. S3BERT wäre auch eine Möglichkeit, weil wir zu unseren Objekten komplexere Statements bzw. ausführlichere Beschreibungen erstellen müssen; dann kann man Fragen vergleichen und schauen, welche Informationen sich überlappen bzw. ähnlich sind.
- natürlichsprachlich zu slots und anderstrum sonst keine Slotmetriken 
- Randomisiertes erstellen aus daten
- wie machen wir anfragen an datenbank 
- Listen wie und was wir machen und dann eine weitere liste mit todos und genauen zeiten plus rollen darauf verteilen
- am besten viele auch kurze meetings und ein logbuch also todos abhaken
- todo listen (auch pro wochen)
 No newline at end of file
- natürlichsprachlich zu slots und anderstrum sonst keine Slotmetriken ??
- zu Todos leute zuteilen und wenn was geschafft immer abhaken + alles pushen
- Todo aus Tafel ableiten und in file eintragen
- Modell zum embedden aufsetzen
+8 −1
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# HeidelKBerg

A knowledge base for retrieval-assisted generation of task-oriented dialogues.

%% Cell type:markdown id: tags:

## Data

We get the data we want from OpenStreetMap via the [Overpass API](https://overpass-turbo.eu) with this query:

```sql
[out:json][timeout:25];
{{geocodeArea:Heidelberg}}->.searchArea;
(
  nwr["amenity"="restaurant"](area.searchArea);
  nwr["amenity"="museum"](area.searchArea);
  nwr["tourism"="hotel"](area.searchArea);
  nwr["tourism"="attraction"](area.searchArea);
);
out geom;
```

and export it to `@/data/osm/osm_export.geojson`.

%% Cell type:markdown id: tags:

## Schema

We define the schema of our knowledge base with pydantic types.

%% Cell type:code id: tags:

``` python
from typing import List, Literal
from pydantic import BaseModel, computed_field


class Address(BaseModel):
    # Postal address of a venue inside Heidelberg.
    street: str
    housenumber: str
    # suburb list from https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
    suburb: Literal[
        "Altstadt",
        "Bahnstadt",
        "Bergheim",
        "Boxberg",
        "Emmertsgrund",
        "Handschuhsheim",
        "Kirchheim",
        "Neuenheim",
        "Pfaffengrund",
        "Rohrbach",
        "Schlierbach",
        "Südstadt",
        "Weststadt",
        "Wieblingen",
        "Ziegelhausen",
    ]

    # Every type that carries an address also needs an area, so the area is
    # derived here once instead of once per venue type, even though strictly
    # speaking it is not a property of the address itself.
    @computed_field
    @property
    def area(self) -> Literal["center", "east", "north", "south", "west"]:
        # Suburbs grouped by the coarse city area they are assigned to.
        suburbs_by_area = {
            "center": ("Altstadt", "Bahnstadt", "Bergheim", "Weststadt"),
            "east": ("Schlierbach", "Ziegelhausen"),
            "north": ("Handschuhsheim", "Neuenheim"),
            "south": (
                "Boxberg",
                "Emmertsgrund",
                "Kirchheim",
                "Rohrbach",
                "Südstadt",
            ),
            "west": ("Pfaffengrund", "Wieblingen"),
        }
        # Invert the grouping into a suburb -> area lookup table.
        area_of = {
            suburb_name: area_name
            for area_name, suburb_names in suburbs_by_area.items()
            for suburb_name in suburb_names
        }
        return area_of[self.suburb]


class Venue(BaseModel):
    # Base model for the venue types below (Hotel, Restaurant and Attraction
    # all inherit these three fields).
    name: str
    description: str
    # Structured address; Address also derives the coarse city area.
    address: Address


class Hotel(Venue):
    # Hotel entry: contact details plus categorical slots.
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    # Star rating as string literals; "unknown" presumably marks a missing
    # rating.
    stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
    # The remaining slots are three-valued strings rather than booleans, so
    # that "unknown" can be stored alongside "yes" and "no".
    air_conditioning: Literal["yes", "no", "unknown"]
    internet_access: Literal["yes", "no", "unknown"]
    parking: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    # NOTE(review): Restaurant.wheelchair additionally lists "limited" —
    # confirm whether hotels should accept that value too.
    wheelchair: Literal["yes", "no", "unknown"]


class Cuisine(BaseModel):
    # One boolean flag per supported cuisine. The field names match the keys
    # of the cuisine dict that write_data() builds for every restaurant.
    # Annotated fields must not end in a comma: `african: bool,` is a
    # SyntaxError.
    african: bool
    american: bool
    asian: bool
    chinese: bool
    german: bool
    greek: bool
    indian: bool
    international: bool
    italian: bool
    japanese: bool
    korean: bool
    oriental: bool
    sushi: bool
    thai: bool


class Restaurant(Venue):
    # Restaurant entry: contact details, categorical slots and cuisine flags.
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    internet_access: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    # The comma between "limited" and "unknown" is required: without it
    # Python concatenates the adjacent literals into "limitedunknown" and
    # neither real value validates.
    wheelchair: Literal["yes", "no", "limited", "unknown"]
    bar: Literal["yes", "no", "unknown"]
    delivery: Literal["yes", "no", "unknown"]
    vegan: Literal["yes", "no", "limited", "unknown"]
    vegetarian: Literal["yes", "no", "limited", "unknown"]
    indoor_seating: Literal["yes", "no", "limited", "unknown"]
    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
    # Boolean flag per supported cuisine, see Cuisine.
    cuisine: Cuisine


class Attraction(Venue):
    # A list, so one attraction can carry several of these type labels.
    attraction_type: List[Literal[
        "zoo",
        "architecture",
        "view",
        "historic",
    ]]


#class Museum(Attraction):
#    theme: str


#class Theatre(Attraction):
#    program: List[Literal[
#                "Monday",
#                "Tuesday",
#                "Wednesday",
#                "Thursday",
#                "Friday",
#                "Saturday",
#                "Sunday",
#    ]]


#class TouristAttraction(Attraction):
#    view: Literal["yes", "no", "unknown"]
```

%% Output

      Cell In[3], line 72
        african: bool,
                     ^
    SyntaxError: invalid syntax

%% Cell type:markdown id: tags:

We import our data from `osm_export.geojson` and split it into restaurants, hotels and attractions for further processing.
Each type is written to three output files so that the data can easily be divided among three manual annotators.

%% Cell type:code id: tags:

``` python
import json
from collections import defaultdict


restaurants = []
hotels = []
attractions = []

# Load data and split into types.
# GeoJSON is UTF-8 by specification; the encoding is given explicitly so that
# names with umlauts load correctly whatever the platform default is. The
# file is only needed for parsing, so it is closed before the loop runs.
with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
    data = json.load(file)

# NOTE(review): `k` is never read in the visible cells. It is kept only so
# the notebook's global namespace stays unchanged; remove it if truly unused.
k = defaultdict(int)

for node in data["features"]:
    # Only the OSM tags are of interest, not the geometry.
    node = node["properties"]
    # "amenity" and "tourism" are independent tags, so a feature tagged with
    # both ends up in both lists.
    if node.get("amenity") == "restaurant":
        restaurants.append(node)
    if node.get("tourism") == "hotel":
        hotels.append(node)
    elif node.get("tourism") == "attraction":
        attractions.append(node)


# OSM-specific keys and the homogenized key each one is folded into.
_KEY_ALIASES = {
    "diet:vegan": "vegan",
    "diet:vegetarian": "vegetarian",
    "contact:phone": "phone",
    "contact:website": "website",
    "addr:street": "street",
    "addr:housenumber": "housenumber",
    "addr:suburb": "suburb",
}

# Keys every written data point has; missing ones get a default value.
_OUTPUT_KEYS = (
    "name",
    "description",
    "street",
    "housenumber",
    "suburb",
    "phone",
    "website",
    "internet_access",
    "smoking",
    "wheelchair",
    "delivery",
    "vegan",
    "vegetarian",
    "indoor_seating",
    "outdoor_seating",
    "cuisine",
    "price",
    "type",
    "edited",
)

# Cuisines that are tracked as boolean flags for restaurants.
_CUISINES = (
    "african",
    "american",
    "asian",
    "chinese",
    "german",
    "greek",
    "indian",
    "international",
    "italian",
    "japanese",
    "korean",
    "oriental",
    "sushi",
    "thai",
)


def _homogenize_data_point(label: str, data_point: dict) -> dict:
    """
    Cleans one data point in place and returns a copy with sorted keys.

        @input label: str
        @input data_point: dict

        @output dict
    """

    # Filter out the unwanted properties
    wanted_keys = set(_OUTPUT_KEYS) | set(_KEY_ALIASES)
    for key in list(data_point):
        if key not in wanted_keys:
            del data_point[key]

    # Homogenize labels: move every OSM-specific key to its plain name.
    # An alias overwrites a plain key that is already present.
    for alias, plain_key in _KEY_ALIASES.items():
        if alias in data_point:
            data_point[plain_key] = data_point.pop(alias)

    # Cuisine: restaurants get one boolean flag per known cuisine instead of
    # the ";"-separated OSM string. Cuisines outside the list are dropped.
    if label == "restaurants":
        listed = {
            cuisine.strip()
            for cuisine in data_point.get("cuisine", "").split(";")
        }
        data_point["cuisine"] = {
            cuisine: cuisine in listed for cuisine in _CUISINES
        }

    # Put a "unknown" marker inside empty properties
    for key in _OUTPUT_KEYS:
        if key in data_point:
            continue
        if key == "type":
            data_point["type"] = label.rstrip("s")
        elif key == "edited":
            data_point["edited"] = False
        else:
            data_point[key] = "unknown"

    return dict(sorted(data_point.items(), key=lambda item: item[0]))


def write_data(label: str, data: list):
    """
    Takes a list of data points of homogenous data type, homogenizes some data keys and removes superfluous keys.
    It then writes the data split into three json files
    (`../data/osm/{label}_1.json` to `../data/osm/{label}_3.json`),
    distributing the data points round-robin.

    The data points in `data` are modified in place.

        @input label: str
        @input data: list

        @output void
    """

    # Prepare to split output data to 3 files
    buckets = [[], [], []]
    for index, data_point in enumerate(data):
        buckets[index % 3].append(_homogenize_data_point(label, data_point))

    # ensure_ascii=False writes umlauts verbatim, so the encoding must not
    # depend on the platform default.
    for number, bucket in enumerate(buckets, start=1):
        path = f"../data/osm/{label}_{number}.json"
        with open(path, "w", encoding="utf-8") as output:
            output.write(
                json.dumps({"data": bucket}, indent=2, ensure_ascii=False)
            )

# Write the preprocessed data of every venue type to its output files
for label, data_points in (
    ("hotels", hotels),
    ("restaurants", restaurants),
    ("attractions", attractions),
):
    write_data(label, data_points)
```

%% Cell type:markdown id: tags:

### Find Suburbs
We are using the official city map of Heidelberg to get the suburb for a specific address, the map can be found here: https://geoweb.heidelberg.de/geoportal/.

The data we extracted from OpenStreetMap is used to get the address for each datapoint. Then we make a request to the heidelberg.de website and find the suburb for that address. After that we fill the suburb value in our json file.

%% Cell type:code id: tags:

``` python
# Imports

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
import json
```

%% Cell type:code id: tags:

``` python
def find_suburb_for_address(address):
    """
    Looks up the Heidelberg suburb of an address in the city's geoportal.

    Opens https://geoweb.heidelberg.de/geoportal/ in a fresh Chrome window,
    switches on the "Stadtteile" layer, searches for the address, clicks the
    centre of the window and scans the page for a known suburb name.
    The browser is closed in every case, also when a step fails.

        @input address: str, typed verbatim into the portal's search bar

        @output str, the first suburb name found on the page, or "unknown"

    Selenium errors (e.g. an element that cannot be found) are not caught.
    """

    suburbs = [
        "Altstadt", "Bahnstadt", "Bergheim", "Boxberg", "Emmertsgrund",
        "Handschuhsheim", "Kirchheim", "Neuenheim", "Pfaffengrund", "Rohrbach",
        "Schlierbach", "Südstadt", "Weststadt", "Wieblingen", "Ziegelhausen"
    ]

    # NOTE(review): newer Selenium 4 releases expect the driver path wrapped
    # in a Service object — confirm against the installed version.
    driver = webdriver.Chrome(ChromeDriverManager().install())

    try:
        # Open the website
        driver.get('https://geoweb.heidelberg.de/geoportal/')

        # Wait for the page to load
        time.sleep(6)

        # Activate the checkbox for "Stadtteile"
        checkbox_button = driver.find_element(By.XPATH, "//tr[@data-qtip='Stadtteile']//input[@type='button']")
        checkbox_button.click()

        # Wait a bit
        time.sleep(2)

        # Find the search bar, clear it, and enter an address
        # NOTE(review): the numeric parts of this id and of the result list
        # id below look auto-generated — verify that they are stable.
        search_bar = driver.find_element(By.ID, 'searchserviceheidelberg-1060-inputEl')
        search_bar.clear()
        search_bar.send_keys(address)

        # Wait for the list
        time.sleep(2)

        # Click on the first item in the list (hopefully there is just one)
        first_item = driver.find_element(By.CSS_SELECTOR, '#boundlist-1121-listEl ul li')
        first_item.click()

        # Wait for results to load
        time.sleep(2)

        # Click on the center of the screen
        window_size = driver.get_window_size()
        center_x = window_size['width'] // 2
        center_y = window_size['height'] // 2
        ActionChains(driver).move_by_offset(center_x, center_y).click().perform()

        # Wait for the popup bubble
        time.sleep(4)

        # The first suburb name that appears in a grid cell of the page wins.
        found_suburb = "unknown"
        for suburb in suburbs:
            suburb_elements = driver.find_elements(By.XPATH, f"//div[contains(@class, 'x-grid-cell-inner') and contains(text(), '{suburb}')]")
            if suburb_elements:
                found_suburb = suburb
                break

        return found_suburb
    finally:
        # Close the browser even if one of the steps above raised, so that
        # no Chrome process is left behind.
        driver.quit()
```

%% Cell type:code id: tags:

``` python
# Read the JSON file
with open('restaurants_1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Iterate over the first 20 entries and find the suburb
for entry in data["data"][:20]:
    # Files written by write_data() use the homogenized keys "street",
    # "housenumber" and "suburb". Entries without them are read with the raw
    # OSM "addr:*" keys this cell originally used.
    prefix = "" if "street" in entry else "addr:"
    address = f"{entry[prefix + 'housenumber']} {entry[prefix + 'street']}"
    suburb = find_suburb_for_address(address)
    entry[prefix + 'suburb'] = suburb

# Save the updated JSON data
with open('test.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)
```
Loading