Commit 09570e4c authored by pracht's avatar pracht
Browse files

Fix schema

parent edfc8dc1
Loading
Loading
Loading
Loading
+14 −14
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# HeidelKBerg

A knowledge base for retrieval-assisted generation of task-oriented dialogues.

%% Cell type:markdown id: tags:

## Data

We retrieve the data we need from OpenStreetMap via the [Overpass API](https://overpass-turbo.eu) with this query:

```sql
[out:json][timeout:25];
{{geocodeArea:Heidelberg}}->.searchArea;
(
  nwr["amenity"="restaurant"](area.searchArea);
  nwr["amenity"="museum"](area.searchArea);
  nwr["tourism"="hotel"](area.searchArea);
  nwr["tourism"="attraction"](area.searchArea);
);
out geom;
```

and export it to `@/data/osm/osm_export.geojson`.

%% Cell type:markdown id: tags:

## Schema

We define the schema of our knowledge base with pydantic types.

%% Cell type:code id: tags:

``` python
from typing import List, Literal
from pydantic import BaseModel, computed_field


class Address(BaseModel):
    # Street address of a venue, restricted to the Heidelberg suburbs below.
    street: str
    housenumber: str
    # suburb list from https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
    suburb: Literal[
        "Altstadt",
        "Bahnstadt",
        "Bergheim",
        "Boxberg",
        "Emmertsgrund",
        "Handschuhsheim",
        "Kirchheim",
        "Neuenheim",
        "Pfaffengrund",
        "Rohrbach",
        "Schlierbach",
        "Südstadt",
        "Weststadt",
        "Wieblingen",
        "Ziegelhausen",
    ]

    # Every type that has an address also has an area, so the area is derived
    # here once instead of being repeated in each venue type, even though it
    # is not strictly a property of an address.
    @computed_field
    @property
    def area(self) -> Literal["center", "east", "north", "south", "west"]:
        # Coarse area -> suburbs that belong to it.
        suburbs_by_area = {
            "center": ("Altstadt", "Bahnstadt", "Bergheim", "Weststadt"),
            "south": (
                "Boxberg",
                "Emmertsgrund",
                "Kirchheim",
                "Rohrbach",
                "Südstadt",
            ),
            "north": ("Handschuhsheim", "Neuenheim"),
            "west": ("Pfaffengrund", "Wieblingen"),
            "east": ("Schlierbach", "Ziegelhausen"),
        }
        for area_name, members in suburbs_by_area.items():
            if self.suburb in members:
                return area_name
        # Same failure mode as a plain dict lookup on an unmapped suburb.
        raise KeyError(self.suburb)


# Common base for the knowledge-base entries below (Hotel, Restaurant,
# Attraction): a name, a free-text description and an Address.
class Venue(BaseModel):
    name: str
    description: str
    # Address also exposes the computed `area` of the venue.
    address: Address


# A hotel entry: the Venue fields plus contact data, a price bucket and a
# set of categorical amenity flags.
class Hotel(Venue):
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    # Star rating is kept as a string so that "unknown" fits the same field.
    stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
    # Amenity flags are tri-state strings rather than booleans so that
    # missing information can be recorded as "unknown".
    air_conditioning: Literal["yes", "no", "unknown"]
    internet_access: Literal["yes", "no", "unknown"]
    parking: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    wheelchair: Literal["yes", "no", "unknown"]


# One boolean flag per supported cuisine; a restaurant may set several.
# Fields must be plain annotated names: a trailing comma after the annotation
# or a string literal as the annotation target is a SyntaxError.
class Cuisine(BaseModel):
    african: bool
    american: bool
    asian: bool
    chinese: bool
    german: bool
    greek: bool
    indian: bool
    international: bool
    italian: bool
    japanese: bool
    korean: bool
    oriental: bool
    sushi: bool
    thai: bool


# A restaurant entry: the Venue fields plus contact data, a price bucket,
# categorical service/diet flags and the set of cuisines offered.
class Restaurant(Venue):
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    internet_access: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    # "limited" and "unknown" are two separate values; without the comma
    # Python concatenates them into the single literal "limitedunknown".
    wheelchair: Literal["yes", "no", "limited", "unknown"]
    bar: Literal["yes", "no", "unknown"]
    delivery: Literal["yes", "no", "unknown"]
    vegan: Literal["yes", "no", "limited", "unknown"]
    vegetarian: Literal["yes", "no", "limited", "unknown"]
    indoor_seating: Literal["yes", "no", "limited", "unknown"]
    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
    cuisine: Cuisine


# An attraction entry: the Venue fields plus one or more category tags.
class Attraction(Venue):
    # A list, so a single attraction can carry several of these categories.
    attraction_type: List[Literal[
        "zoo",
        "architecture",
        "view",
        "historic",
    ]]


#class Museum(Attraction):
#    theme: str


#class Theatre(Attraction):
#    program: List[Literal[
#                "Monday",
#                "Tuesday",
#                "Wednesday",
#                "Thursday",
#                "Friday",
#                "Saturday",
#                "Sunday",
#    ]]


#class TouristAttraction(Attraction):
#    view: Literal["yes", "no", "unknown"]
```

%% Output

      Cell In[3], line 72
        african: bool,
                     ^
    SyntaxError: invalid syntax

%% Cell type:markdown id: tags:

We import our data from `osm_export.geojson` and split it into restaurants, hotels and attractions for further processing.
Each category is written to three output files so that the data can easily be divided among three manual annotators.

%% Cell type:code id: tags:

``` python
import json
from collections import defaultdict


restaurants = []
hotels = []
attractions = []

# Load data and split into types.
# The encoding is given explicitly: GeoJSON is UTF-8 and the export contains
# non-ASCII characters (e.g. umlauts), which the platform default encoding
# may not decode correctly.
with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
    data = json.load(file)

for feature in data["features"]:
    # Only the tag dictionary of each feature is kept; geometry is dropped.
    node = feature["properties"]
    # "amenity" and "tourism" are checked independently, so a feature tagged
    # with both ends up in both lists.
    if node.get("amenity") == "restaurant":
        restaurants.append(node)
    tourism = node.get("tourism")
    if tourism == "hotel":
        hotels.append(node)
    elif tourism == "attraction":
        attractions.append(node)


# Keys that survive preprocessing; every other property of the export is dropped.
_DATA_KEYS = (
    "name",
    "description",
    "addr:street",
    "street",
    "addr:housenumber",
    "housenumber",
    "addr:suburb",
    "suburb",
    "phone",
    "contact:phone",
    "website",
    "contact:website",
    "internet_access",
    "smoking",
    "wheelchair",
    "delivery",
    "vegan",
    "diet:vegan",
    "vegetarian",
    "diet:vegetarian",
    "indoor_seating",
    "outdoor_seating",
    "cuisine",
    "price",
    "type",
    "edited",
)

# Alternative source keys and the canonical key each one is folded into.
# When both are present, the alternative key's value wins.
_KEY_ALIASES = {
    "diet:vegan": "vegan",
    "diet:vegetarian": "vegetarian",
    "contact:phone": "phone",
    "contact:website": "website",
    "addr:street": "street",
    "addr:housenumber": "housenumber",
    "addr:suburb": "suburb",
}

# Cuisine flags written for every restaurant.
_CUISINES = (
    "african",
    "american",
    "asian",
    "chinese",
    "german",
    "greek",
    "indian",
    "international",
    "italian",
    "japanese",
    "korean",
    "oriental",
    "sushi",
    "thai",
)


def _normalize_record(label: str, data_point: dict) -> dict:
    """Return a cleaned, key-sorted copy of one data point (input untouched)."""
    # Filter out the unwanted properties
    record = {
        key: value for key, value in data_point.items() if key in _DATA_KEYS
    }

    # Homogenize labels (vegan/vegetarian, contact, address)
    for alias, canonical in _KEY_ALIASES.items():
        if alias in record:
            record[canonical] = record.pop(alias)

    # Cuisine: restaurants get one boolean per known cuisine instead of the
    # ";"-separated string; other labels keep whatever value they had.
    if label == "restaurants":
        listed = {
            token.strip() for token in record.get("cuisine", "").split(";")
        }
        record["cuisine"] = {name: name in listed for name in _CUISINES}

    # Put an "unknown" marker inside empty properties
    # NOTE(review): "type" and "edited" are only filled in when the source
    # record has no property of that name; an existing one is kept as-is.
    for key in _DATA_KEYS:
        if key in record or key in _KEY_ALIASES:
            continue
        if key == "type":
            # Drop exactly one plural "s" ("hotels" -> "hotel").
            record[key] = label[:-1] if label.endswith("s") else label
        elif key == "edited":
            record[key] = False
        else:
            record[key] = "unknown"

    return {key: record[key] for key in sorted(record)}


def write_data(label: str, data: list, *, n_splits: int = 3) -> None:
    """
    Takes a list of data points of homogeneous data type, homogenizes some data keys and removes superfluous keys.
    It then writes the data, distributed round-robin, into `n_splits` json files
    named `../data/osm/{label}_1.json` ... `../data/osm/{label}_{n_splits}.json`,
    each of the form `{"data": [...]}`.

    The dictionaries in `data` are not modified; cleaned copies are written.

        @input label: str       plural type name, e.g. "restaurants"
        @input data: list       list of property dictionaries
        @input n_splits: int    number of output files (default 3)

        @output void

        @raises ValueError      if n_splits is smaller than 1
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # Distribute the data points round-robin over the output files
    buckets = [[] for _ in range(n_splits)]
    for index, data_point in enumerate(data):
        buckets[index % n_splits].append(_normalize_record(label, data_point))

    for number, bucket in enumerate(buckets, start=1):
        # Explicit UTF-8: the dump keeps non-ASCII characters unescaped.
        path = f"../data/osm/{label}_{number}.json"
        with open(path, "w", encoding="utf-8") as output:
            output.write(
                json.dumps({"data": bucket}, indent=2, ensure_ascii=False)
            )

# Write preprocessed data to output files
write_data("hotels", hotels)
write_data("restaurants", restaurants)
write_data("attractions", attractions)
```

%% Cell type:markdown id: tags:

### Find Suburbs
We are using the official city map of Heidelberg to get the suburb for a specific address, the map can be found here: https://geoweb.heidelberg.de/geoportal/.

The data we extracted from OpenStreetMap is used to get the address for each datapoint. Then we make a request to the heidelberg.de website and find the suburb for that address. After that we fill the suburb value in our json file.

%% Cell type:code id: tags:

``` python
# Imports

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
import json
```

%% Cell type:code id: tags:

``` python
def find_suburb_for_address(address):
    """Look up the Heidelberg suburb of an address via the city geoportal.

    Opens a Chrome session, enables the "Stadtteile" layer, searches for
    `address`, clicks the centre of the window and scans the resulting popup
    for one of the known suburb names.

    Args:
        address: Address text typed into the geoportal search bar.

    Returns:
        The first known suburb name found in the popup, or "unknown" if none
        of them appears.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if one of the
            expected page elements is missing. The browser is closed before
            the exception propagates.
    """
    suburbs = [
        "Altstadt", "Bahnstadt", "Bergheim", "Boxberg", "Emmertsgrund",
        "Handschuhsheim", "Kirchheim", "Neuenheim", "Pfaffengrund", "Rohrbach",
        "Schlierbach", "Südstadt", "Weststadt", "Wieblingen", "Ziegelhausen"
    ]

    driver = webdriver.Chrome(ChromeDriverManager().install())

    # try/finally so the browser is closed even when a lookup step fails;
    # otherwise every failed address would leave a Chrome process behind.
    try:
        # Open the website
        driver.get('https://geoweb.heidelberg.de/geoportal/')

        # Wait for the page to load
        time.sleep(6)

        # Activate the checkbox for "Stadtteile"
        checkbox_button = driver.find_element(By.XPATH, "//tr[@data-qtip='Stadtteile']//input[@type='button']")
        checkbox_button.click()

        # Wait a bit
        time.sleep(2)

        # Find the search bar, clear it, and enter an address
        search_bar = driver.find_element(By.ID, 'searchserviceheidelberg-1060-inputEl')
        search_bar.clear()
        search_bar.send_keys(address)

        # Wait for the list
        time.sleep(2)

        # Click on the first item in the list (hopefully there is just one)
        first_item = driver.find_element(By.CSS_SELECTOR, '#boundlist-1121-listEl ul li')
        first_item.click()

        # Wait for results to load
        time.sleep(2)

        # Click on the center of the screen
        window_size = driver.get_window_size()
        center_x = window_size['width'] // 2
        center_y = window_size['height'] // 2
        ActionChains(driver).move_by_offset(center_x, center_y).click().perform()

        # Wait for the popup bubble
        time.sleep(4)

        # Return the first known suburb whose name appears in a grid cell
        for suburb in suburbs:
            suburb_elements = driver.find_elements(By.XPATH, f"//div[contains(@class, 'x-grid-cell-inner') and contains(text(), '{suburb}')]")
            if suburb_elements:
                return suburb

        return "unknown"
    finally:
        # Close the browser
        driver.quit()
```

%% Cell type:code id: tags:

``` python
# Read the JSON file
# NOTE(review): write_data() writes to ../data/osm/ — confirm this relative
# path matches the directory the notebook is run from.
with open('restaurants_1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Iterate over each entry and find the suburb
for entry in data["data"][:20]:
    # write_data() renames the "addr:*" keys to bare names, so its output has
    # "street"/"housenumber"/"suburb". Accept both namings and write the
    # suburb back under the naming the entry already uses.
    prefix = "addr:" if "addr:street" in entry else ""
    address = f"{entry[prefix + 'housenumber']} {entry[prefix + 'street']}"
    suburb = find_suburb_for_address(address)
    entry[prefix + 'suburb'] = suburb

# Save the updated JSON data (non-ASCII characters kept readable, as in the
# files written by write_data())
with open('test.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, ensure_ascii=False)
```