Commit d66a6412 authored by pracht's avatar pracht
Browse files

Generalizes preprocessing to all data types

parent 6969f0e4
Loading
Loading
Loading
Loading
+109 −102
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# HeidelKBerg

A knowledge base for retrieval-assisted generation of task-oriented dialogues.

%% Cell type:markdown id: tags:

## Data

We get the data we want from OpenStreetMap's [Overpass API](https://overpass-turbo.eu) with this query:

```sql
[out:json][timeout:25];
{{geocodeArea:Heidelberg}}->.searchArea;
(
  nwr["amenity"="restaurant"](area.searchArea);
  nwr["amenity"="museum"](area.searchArea);
  nwr["tourism"="hotel"](area.searchArea);
  nwr["tourism"="attraction"](area.searchArea);
);
out geom;
```

and export it to `@/data/osm/osm_export.geojson`.

%% Cell type:markdown id: tags:

## Schema

We define the schema of our knowledge base with pydantic types.

%% Cell type:code id: tags:

``` python
from typing import List, Literal
from pydantic import BaseModel, computed_field


class Address(BaseModel):
    # Street address of a venue; `suburb` is restricted to the listed
    # Heidelberg districts.
    street: str
    housenumber: str
    # suburb list from https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
    suburb: Literal[
        "Altstadt",
        "Bahnstadt",
        "Bergheim",
        "Boxberg",
        "Emmertsgrund",
        "Handschuhsheim",
        "Kirchheim",
        "Neuenheim",
        "Pfaffengrund",
        "Rohrbach",
        "Schlierbach",
        "Südstadt",
        "Weststadt",
        "Wieblingen",
        "Ziegelhausen"
    ]

    # Every type that has an address also has an area, so the area is derived
    # here once instead of being duplicated per type, even though it is not
    # strictly a property of an address.
    @computed_field
    @property
    def area(self) -> Literal["center", "east", "north", "south", "west"]:
        # Suburbs grouped by the coarse city area they belong to.
        suburbs_by_area = {
            "center": ("Altstadt", "Bahnstadt", "Bergheim", "Weststadt"),
            "east": ("Schlierbach", "Ziegelhausen"),
            "north": ("Handschuhsheim", "Neuenheim"),
            "south": (
                "Boxberg",
                "Emmertsgrund",
                "Kirchheim",
                "Rohrbach",
                "Südstadt",
            ),
            "west": ("Pfaffengrund", "Wieblingen"),
        }
        for region, suburbs in suburbs_by_area.items():
            if self.suburb in suburbs:
                return region
        # Same failure mode as a plain dict lookup on an unmapped suburb.
        raise KeyError(self.suburb)


class Venue(BaseModel):
    # Common base for all knowledge-base entries: Hotel, Restaurant and
    # Attraction extend it with their type-specific fields.
    name: str
    description: str
    # Structured address; also exposes the computed `area` of the venue.
    address: Address


class Hotel(Venue):
    # Venue with contact details, a price category, a star rating and a set
    # of amenity flags.
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    # Star rating is stored as a string so that "unknown" is a valid value.
    stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
    # Amenity flags are tri-state strings rather than booleans.
    air_conditioning: Literal["yes", "no", "unknown"]
    internet_access: Literal["yes", "no", "unknown"]
    parking: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    wheelchair: Literal["yes", "no", "unknown"]


class Restaurant(Venue):
    # Venue with contact details, a price category, amenity/diet flags and a
    # list of cuisines.
    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    internet_access: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    # The comma between "limited" and "unknown" is required: without it Python
    # concatenates the two adjacent string literals into "limitedunknown".
    wheelchair: Literal["yes", "no", "limited", "unknown"]
    bar: Literal["yes", "no", "unknown"]
    delivery: Literal["yes", "no", "unknown"]
    vegan: Literal["yes", "no", "limited", "unknown"]
    vegetarian: Literal["yes", "no", "limited", "unknown"]
    indoor_seating: Literal["yes", "no", "limited", "unknown"]
    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
    # A restaurant may list several cuisines, each from this closed set.
    cuisine: List[Literal[
        "african",
        "american",
        "asian",
        "chinese",
        "german",
        "greek",
        "indian",
        "international",
        "italian",
        "japanese",
        "korean",
        "oriental",
        "sushi",
        "thai",
    ]]


class Attraction(Venue):
    # Venue tagged with one or more attraction categories from a closed set.
    attraction_type: List[Literal[
        "zoo",
        "architecture",
        "view",
        "historic",
    ]]


#class Museum(Attraction):
#    theme: str


#class Theatre(Attraction):
#    program: List[Literal[
#                "Monday",
#                "Tuesday",
#                "Wednesday",
#                "Thursday",
#                "Friday",
#                "Saturday",
#                "Sunday",
#    ]]


#class TouristAttraction(Attraction):
#    view: Literal["yes", "no", "unknown"]
```

%% Cell type:markdown id: tags:

We import our data from `osm_export.geojson` and split it into restaurants, hotels and attractions for further processing.
We use three output files so that the data can easily be divided among three manual annotators.

%% Cell type:code id: tags:

``` python
import json
import ndjson
from collections import defaultdict


restaurants = []
hotels = []
attractions = []

# Load data and split into types.
# The export contains non-ASCII characters (e.g. "Südstadt"), so the file is
# read explicitly as UTF-8 instead of relying on the platform default encoding.
with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
    data = json.load(file)

# NOTE(review): `k` is not used anywhere in this cell; it is kept only in case
# another cell relies on it — confirm and remove.
k = defaultdict(int)

for node in data["features"]:
    # Only the tag dictionary of each feature is needed, not its geometry.
    node = node["properties"]
    if node.get("amenity") == "restaurant":
        restaurants.append(node)
    # A feature has a single "tourism" value, so the two cases are exclusive.
    if node.get("tourism") == "hotel":
        hotels.append(node)
    elif node.get("tourism") == "attraction":
        attractions.append(node)
```

%% Cell type:markdown id: tags:

## Restaurants

Here we are processing the restaurants. We start by declaring our desired keys.
Then we remove all unnecessary keys and fill missing keys with a marker.

%% Cell type:code id: tags:

``` python
def write_data(label: str, data: list, out_dir: str = "../data/osm"):
    """
    Takes a list of data points of homogenous data type, homogenizes some data
    keys and removes superfluous keys. It then writes the data, split
    round-robin, into three json files named `<label>_1.json`,
    `<label>_2.json` and `<label>_3.json`.

    The data points are modified in place: keys that are not in the list of
    known labels are dropped, alias keys (e.g. "contact:phone") are folded into
    their canonical key (e.g. "phone"; the alias value wins if both exist), and
    every missing canonical key is filled with the marker "unknown".

        @input label: str       file name prefix of the output files
        @input data: list       list of property dictionaries
        @input out_dir: str     directory the three files are written to

        @output void
    """

    # Define possible labels
    # NOTE(review): this list mirrors the restaurant properties; hotel- or
    # attraction-specific tags (e.g. stars) are dropped — confirm intended.
    data_keys = [
        "name",
        "description",
        "addr:street",
        "addr:housenumber",
        "addr:suburb",
        "phone",
        "contact:phone",
        "website",
        "contact:website",
        "internet_access",
        "smoking",
        "wheelchair",
        "delivery",
        "vegan",
        "diet:vegan",
        "vegetarian",
        "diet:vegetarian",
        "indoor_seating",
        "outdoor_seating",
        "cuisine",
        "price",
    ]

    # Alias labels and the canonical label each of them is folded into
    aliases = {
        "diet:vegan": "vegan",
        "diet:vegetarian": "vegetarian",
        "contact:phone": "phone",
        "contact:website": "website",
    }

    # Prepare to split output data to 3 files
    buckets = [[], [], []]

    # Iterating over data points
    for index, data_point in enumerate(data):
        # Filter out the unwanted properties (iterate over a copy of the keys
        # because the dictionary is modified while filtering)
        for key in list(data_point):
            if key not in data_keys:
                data_point.pop(key)

        # Homogenize labels
        for alias, canonical in aliases.items():
            if alias in data_point:
                data_point[canonical] = data_point.pop(alias)

        # Put a "unknown" marker inside empty properties; alias labels are
        # skipped because they must not appear in the output
        for key in data_keys:
            if key not in aliases:
                data_point.setdefault(key, "unknown")

        # Distribute round-robin, storing each data point with sorted keys
        buckets[index % 3].append(dict(sorted(data_point.items())))

    # UTF-8 is set explicitly because ensure_ascii=False emits raw umlauts
    for number, bucket in enumerate(buckets, start=1):
        path = f"{out_dir}/{label}_{number}.json"
        with open(path, "w", encoding="utf-8") as output:
            output.write(
                json.dumps({"data": bucket}, indent=2, ensure_ascii=False)
            )

# Write each preprocessed data set to its own trio of output files
for _label, _records in (
    ("hotels", hotels),
    ("restaurants", restaurants),
    ("attractions", attractions),
):
    write_data(_label, _records)
```