Commit 51517272 authored by pracht's avatar pracht
Browse files

Multiline comment got printed in output

parent 817714cc
Loading
Loading
Loading
Loading
+15 −16
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# HeidelKBerg

A knowledge base for retrieval-assisted generation of task-oriented dialogues.

%% Cell type:markdown id: tags:

## Data

We get the data we want from OpenStreetMap's [Overpass API](https://overpass-turbo.eu) with this query:

```sql
[out:json][timeout:25];
{{geocodeArea:Heidelberg}}->.searchArea;
(
  nwr["amenity"="restaurant"](area.searchArea);
  nwr["amenity"="museum"](area.searchArea);
  nwr["tourism"="hotel"](area.searchArea);
  nwr["tourism"="attraction"](area.searchArea);
);
out geom;
```

and export it to `@/data/osm/osm_export.geojson`.

%% Cell type:markdown id: tags:

## Schema

We define the schema of our knowledge base with pydantic types.

%% Cell type:code id: tags:

``` python
from typing import List, Literal
from pydantic import BaseModel, computed_field


class Address(BaseModel):
    # Postal address of a venue, restricted to Heidelberg's districts.
    street: str
    housenumber: str
    # Allowed values follow the district list published at
    # https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
    suburb: Literal[
        "Altstadt",
        "Bahnstadt",
        "Bergheim",
        "Boxberg",
        "Emmertsgrund",
        "Handschuhsheim",
        "Kirchheim",
        "Neuenheim",
        "Pfaffengrund",
        "Rohrbach",
        "Schlierbach",
        "Südstadt",
        "Weststadt",
        "Wieblingen",
        "Ziegelhausen"
    ]

    # Every type that carries an address also needs an area, so the area is
    # derived once here to avoid duplicating the mapping, even though it is
    # not strictly a property of an address.
    @computed_field
    @property
    def area(self) -> Literal["center", "east", "north", "south", "west"]:
        # Coarse city area -> suburbs that belong to it.
        suburbs_by_area = {
            "center": ("Altstadt", "Bahnstadt", "Bergheim", "Weststadt"),
            "east": ("Schlierbach", "Ziegelhausen"),
            "north": ("Handschuhsheim", "Neuenheim"),
            "south": (
                "Boxberg",
                "Emmertsgrund",
                "Kirchheim",
                "Rohrbach",
                "Südstadt",
            ),
            "west": ("Pfaffengrund", "Wieblingen"),
        }
        for area_name, suburbs in suburbs_by_area.items():
            if self.suburb in suburbs:
                return area_name
        # Same failure mode as a plain dict lookup on an unmapped suburb.
        raise KeyError(self.suburb)


class Venue(BaseModel):
    # Common base for every knowledge-base entry: Hotel, Restaurant and
    # Attraction all inherit these three fields.
    name: str
    description: str
    # Nested Address model; it also exposes the computed `area`.
    address: Address


# Value set shared by all yes/no style Hotel fields.
_HotelYesNoUnknown = Literal["yes", "no", "unknown"]


class Hotel(Venue):
    # Hotel entry: contact data, price class, star rating and amenities.
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    # Star rating is kept as a string so "unknown" fits the same field.
    stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
    air_conditioning: _HotelYesNoUnknown
    internet_access: _HotelYesNoUnknown
    parking: _HotelYesNoUnknown
    smoking: _HotelYesNoUnknown
    wheelchair: _HotelYesNoUnknown


class Restaurant(Venue):
    # Restaurant entry: contact data, price class, amenities, dietary
    # options, seating and the cuisines served.
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    internet_access: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    # The comma between "limited" and "unknown" is required: without it the
    # two adjacent literals are concatenated into the single value
    # "limitedunknown", and both intended values fail validation.
    wheelchair: Literal["yes", "no", "limited", "unknown"]
    bar: Literal["yes", "no", "unknown"]
    delivery: Literal["yes", "no", "unknown"]
    vegan: Literal["yes", "no", "limited", "unknown"]
    vegetarian: Literal["yes", "no", "limited", "unknown"]
    indoor_seating: Literal["yes", "no", "limited", "unknown"]
    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
    # A restaurant may list several cuisines, each from this closed set.
    cuisine: List[Literal[
        "african",
        "american",
        "asian",
        "chinese",
        "german",
        "greek",
        "indian",
        "international",
        "italian",
        "japanese",
        "korean",
        "oriental",
        "sushi",
        "thai",
    ]]


class Attraction(Venue):
    # Attraction entry; adds only a categorisation on top of Venue.
    # An attraction may carry several of these type labels at once.
    attraction_type: List[Literal[
        "zoo",
        "architecture",
        "view",
        "historic",
    ]]

'''
class Museum(Attraction):
    theme: str

#class Museum(Attraction):
#    theme: str

class Theatre(Attraction):
    program: List[Literal[
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday",
                "Saturday",
                "Sunday",
    ]]

#class Theatre(Attraction):
#    program: List[Literal[
#                "Monday",
#                "Tuesday",
#                "Wednesday",
#                "Thursday",
#                "Friday",
#                "Saturday",
#                "Sunday",
#    ]]


class TouristAttraction(Attraction):
    view: Literal["yes", "no", "unknown"]
'''
#class TouristAttraction(Attraction):
#    view: Literal["yes", "no", "unknown"]
```

%% Cell type:markdown id: tags:

We import our data from `osm_export.geojson` and split it into restaurants, hotels and attractions for further processing.

%% Cell type:code id: tags:

``` python
import json
import ndjson
from collections import defaultdict


restaurants = []
hotels = []
attractions = []

# Load data and split into types.
# The encoding is given explicitly so the export is decoded as UTF-8 instead
# of the platform's locale default, which would garble non-ASCII text on
# systems where that default is not UTF-8.
with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
    data = json.load(file)

# NOTE(review): `k` is never read or updated in this cell; it is kept only in
# case a later cell relies on it -- confirm and remove if unused.
k = defaultdict(int)

for feature in data["features"]:
    # Only the tag dictionary is needed; geometry is dropped here.
    node = feature["properties"]

    if node.get("amenity") == "restaurant":
        restaurants.append(node)

    # A feature has at most one `tourism` value, so the branches below are
    # mutually exclusive.
    tourism = node.get("tourism")
    if tourism == "hotel":
        hotels.append(node)
    elif tourism == "attraction":
        attractions.append(node)
```

%% Cell type:markdown id: tags:

## Restaurants

Here we process the restaurants. We start by declaring the keys we want to keep.
Then we remove all other keys and fill every missing key with an `"unknown"` marker.

%% Cell type:code id: tags:

``` python
# Properties kept for each restaurant; everything else is dropped.
# NOTE(review): the Restaurant schema declares a `bar` field that is not
# listed here, and `contact:phone` / `contact:website` are kept but not merged
# into `phone` / `website` the way the `diet:*` keys are -- confirm intent.
restaurant_keys = [
    "name",
    "description",
    "addr:street",
    "addr:housenumber",
    "addr:suburb",
    "phone",
    "contact:phone",
    "website",
    "contact:website",
    "address",
    "internet_access",
    "smoking",
    "wheelchair",
    "delivery",
    "vegan",
    "diet:vegan",
    "vegetarian",
    "diet:vegetarian",
    "indoor_seating",
    "outdoor_seating",
    "cuisine",
    "price",
]

# Setup 3 output files to easily split manual editing by 3
of0 = []
of1 = []
of2 = []
buckets = (of0, of1, of2)

# Iterating over restaurants; each dict is cleaned in place.
for i, r in enumerate(restaurants):
    # Filter out the unwanted properties. Iterate over a snapshot of the keys
    # because the dict is modified during the loop.
    for key in list(r):
        if key not in restaurant_keys:
            del r[key]

    # Merge inconsistent labels: the `diet:*` value replaces the plain key
    # and the `diet:*` key itself is removed. (The original popped
    # "vegetarian" here, discarding the value it had just merged.)
    if "diet:vegan" in r:
        r["vegan"] = r.pop("diet:vegan")
    if "diet:vegetarian" in r:
        r["vegetarian"] = r.pop("diet:vegetarian")

    # Put an "unknown" marker inside empty properties, except for the
    # `diet:*` aliases that were merged away above.
    for key in restaurant_keys:
        if key in ("diet:vegan", "diet:vegetarian"):
            continue
        r.setdefault(key, "unknown")

    # Round-robin over the three output lists, with keys sorted by name so
    # every record has the same layout.
    buckets[i % 3].append(dict(sorted(r.items(), key=lambda item: item[0])))



# Wrap each list in a top-level "data" object for the JSON files.
o0 = {"data": of0}
o1 = {"data": of1}
o2 = {"data": of2}

# Write the three parts. `with` closes each file even if serialisation fails,
# and the explicit UTF-8 encoding is needed because ensure_ascii=False emits
# non-ASCII characters verbatim, which a non-UTF-8 locale default may be
# unable to encode.
for path, payload in (
    ("../data/osm/restaurants_1.json", o0),
    ("../data/osm/restaurants_2.json", o1),
    ("../data/osm/restaurants_3.json", o2),
):
    with open(path, "w", encoding="utf-8") as output:
        output.write(json.dumps(payload, indent=2, ensure_ascii=False))
```