# notebooks/knowledge_base.ipynb

# %% [markdown]
# # HeidelKBerg
#
# A knowledge base for retrieval-assisted generation of task-oriented
# dialogues.

# %% [markdown]
# ## Data
#
# We get the data we want from OpenStreetMap's
# [Overpass API](https://overpass-turbo.eu) with this query:
#
# ```sql
# [out:json][timeout:25];
# {{geocodeArea:Heidelberg}}->.searchArea;
# (
#   nwr["amenity"="restaurant"](area.searchArea);
#   nwr["amenity"="museum"](area.searchArea);
#   nwr["tourism"="hotel"](area.searchArea);
#   nwr["tourism"="attraction"](area.searchArea);
# );
# out geom;
# ```
#
# and export it to `@/data/osm/osm_export.geojson`.

# %% [markdown]
# ## Schema
#
# We define the schema of our knowledge base with pydantic types.

# %%
from typing import List, Literal

from pydantic import BaseModel, computed_field

# Maps every suburb accepted by ``Address.suburb`` to its coarse city area.
# Kept at module level so the table is built once instead of on every
# ``Address.area`` access.
_AREA_BY_SUBURB = {
    "Altstadt": "center",
    "Bahnstadt": "center",
    "Bergheim": "center",
    "Boxberg": "south",
    "Emmertsgrund": "south",
    "Handschuhsheim": "north",
    "Kirchheim": "south",
    "Neuenheim": "north",
    "Pfaffengrund": "west",
    "Rohrbach": "south",
    "Schlierbach": "east",
    "Südstadt": "south",
    "Weststadt": "center",
    "Wieblingen": "west",
    "Ziegelhausen": "east",
}


class Address(BaseModel):
    """Street address of a venue, restricted to Heidelberg's suburbs."""

    street: str
    housenumber: str
    # suburb list from https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
    suburb: Literal[
        "Altstadt",
        "Bahnstadt",
        "Bergheim",
        "Boxberg",
        "Emmertsgrund",
        "Handschuhsheim",
        "Kirchheim",
        "Neuenheim",
        "Pfaffengrund",
        "Rohrbach",
        "Schlierbach",
        "Südstadt",
        "Weststadt",
        "Wieblingen",
        "Ziegelhausen",
    ]

    # Since all types that have an address have an area, we compute the area
    # on address level to minimize code duplication, although it technically
    # is no address property.
    @computed_field
    @property
    def area(self) -> Literal["center", "east", "north", "south", "west"]:
        """Return the coarse city area that ``suburb`` belongs to."""
        return _AREA_BY_SUBURB[self.suburb]


class Venue(BaseModel):
    """Fields shared by every entry of the knowledge base."""

    name: str
    description: str
    address: Address


class Hotel(Venue):
    """A hotel entry; feature flags are "yes"/"no"/"unknown" strings."""

    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
    air_conditioning: Literal["yes", "no", "unknown"]
    internet_access: Literal["yes", "no", "unknown"]
    parking: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    wheelchair: Literal["yes", "no", "unknown"]


class Restaurant(Venue):
    """A restaurant entry with dietary, seating and cuisine information."""

    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    internet_access: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    # The comma between "limited" and "unknown" matters: without it Python
    # concatenates the adjacent literals into the single value
    # "limitedunknown" and both real values are rejected.
    wheelchair: Literal["yes", "no", "limited", "unknown"]
    bar: Literal["yes", "no", "unknown"]
    delivery: Literal["yes", "no", "unknown"]
    vegan: Literal["yes", "no", "limited", "unknown"]
    vegetarian: Literal["yes", "no", "limited", "unknown"]
    indoor_seating: Literal["yes", "no", "limited", "unknown"]
    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
    cuisine: List[Literal[
        "african",
        "american",
        "asian",
        "chinese",
        "german",
        "greek",
        "indian",
        "international",
        "italian",
        "japanese",
        "korean",
        "oriental",
        "sushi",
        "thai",
    ]]


class Attraction(Venue):
    """An attraction entry tagged with one or more attraction types."""

    attraction_type: List[Literal[
        "zoo",
        "architecture",
        "view",
        "historic",
    ]]


#class Museum(Attraction):
#    theme: str

#class Theatre(Attraction):
#    program: List[Literal[
#        "Monday",
#        "Tuesday",
#        "Wednesday",
#        "Thursday",
#        "Friday",
#        "Saturday",
#        "Sunday",
#    ]]

#class TouristAttraction(Attraction):
#    view: Literal["yes", "no", "unknown"]

# %% [markdown]
# We import our data from `osm_export.geojson` and split it into
# restaurants, hotels and attractions for further processing. We use three
# output files per type so the data can easily be divided among three manual
# annotators.
# %%
import json
from collections import defaultdict

restaurants = []
hotels = []
attractions = []

# Load data and split into types. The encoding is given explicitly so the
# export is decoded as UTF-8 regardless of the platform's default encoding.
with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
    data = json.load(file)

# NOTE(review): `k` is not used anywhere in this cell; it is kept in case a
# later cell references it -- confirm and remove if not.
k = defaultdict(int)

for node in data["features"]:
    node = node["properties"]
    if node.get("amenity") == "restaurant":
        restaurants.append(node)
    tourism = node.get("tourism")
    if tourism == "hotel":
        hotels.append(node)
    if tourism == "attraction":
        attractions.append(node)


def write_data(label: str, data: list):
    """
    Clean a list of data points of one type and write them to three files.

    For every data point, properties that are not listed in `data_keys` are
    dropped, alias keys (`diet:*`, `contact:*`) are renamed to their plain
    key, and every plain key that is still missing is filled in: `type` with
    the singular form of `label`, `edited` with `False`, anything else with
    `"unknown"`. The cleaned points are distributed round-robin over
    `../data/osm/{label}_1.json`, `_2.json` and `_3.json`, each with its keys
    sorted alphabetically.

    The dicts in `data` are not modified; each point is cleaned on a copy so
    a node that appears in several type lists is not altered by an earlier
    call.

    @input label: str   plural type name, also used as output file prefix
    @input data: list   list of property dicts
    @output void
    """
    # Define possible labels
    data_keys = [
        "name",
        "description",
        "addr:street",
        "addr:housenumber",
        "addr:suburb",
        "phone",
        "contact:phone",
        "website",
        "contact:website",
        "internet_access",
        "smoking",
        "wheelchair",
        "delivery",
        "vegan",
        "diet:vegan",
        "vegetarian",
        "diet:vegetarian",
        "indoor_seating",
        "outdoor_seating",
        "cuisine",
        "price",
        "type",
        "edited",
    ]
    # Alias key -> plain key it is merged into. When both are present the
    # alias value wins, as it already did for the diet:* keys.
    aliases = {
        "diet:vegan": "vegan",
        "diet:vegetarian": "vegetarian",
        "contact:phone": "phone",
        "contact:website": "website",
    }
    # Only one trailing "s" is removed to build the singular type name.
    type_name = label[:-1] if label.endswith("s") else label

    # Prepare to split output data to 3 files
    splits = [[], [], []]

    for index, data_point in enumerate(data):
        # Filter out the unwanted properties (on a copy, see docstring)
        cleaned = {
            key: value
            for key, value in data_point.items()
            if key in data_keys
        }

        # Homogenize labels
        for alias, plain in aliases.items():
            if alias in cleaned:
                cleaned[plain] = cleaned.pop(alias)

        # Put an "unknown" marker inside empty properties
        for key in data_keys:
            if key in cleaned or key in aliases:
                continue
            if key == "type":
                cleaned["type"] = type_name
            elif key == "edited":
                cleaned["edited"] = False
            else:
                cleaned[key] = "unknown"

        splits[index % 3].append(dict(sorted(cleaned.items())))

    for number, split in enumerate(splits, start=1):
        # ensure_ascii=False emits non-ASCII characters verbatim, so the
        # file must be opened with an explicit UTF-8 encoding.
        path = f"../data/osm/{label}_{number}.json"
        with open(path, "w", encoding="utf-8") as output:
            json.dump({"data": split}, output, indent=2, ensure_ascii=False)


# Write preprocessed data to output files
write_data("hotels", hotels)
write_data("restaurants", restaurants)
write_data("attractions", attractions)
# notebooks/knowledge_base.ipynb

# %% [markdown]
# # HeidelKBerg
#
# A knowledge base for retrieval-assisted generation of task-oriented
# dialogues.

# %% [markdown]
# ## Data
#
# We get the data we want from OpenStreetMap's
# [Overpass API](https://overpass-turbo.eu) with this query:
#
# ```sql
# [out:json][timeout:25];
# {{geocodeArea:Heidelberg}}->.searchArea;
# (
#   nwr["amenity"="restaurant"](area.searchArea);
#   nwr["amenity"="museum"](area.searchArea);
#   nwr["tourism"="hotel"](area.searchArea);
#   nwr["tourism"="attraction"](area.searchArea);
# );
# out geom;
# ```
#
# and export it to `@/data/osm/osm_export.geojson`.

# %% [markdown]
# ## Schema
#
# We define the schema of our knowledge base with pydantic types.

# %%
from typing import List, Literal

from pydantic import BaseModel, computed_field

# Maps every suburb accepted by ``Address.suburb`` to its coarse city area.
# Kept at module level so the table is built once instead of on every
# ``Address.area`` access.
_AREA_BY_SUBURB = {
    "Altstadt": "center",
    "Bahnstadt": "center",
    "Bergheim": "center",
    "Boxberg": "south",
    "Emmertsgrund": "south",
    "Handschuhsheim": "north",
    "Kirchheim": "south",
    "Neuenheim": "north",
    "Pfaffengrund": "west",
    "Rohrbach": "south",
    "Schlierbach": "east",
    "Südstadt": "south",
    "Weststadt": "center",
    "Wieblingen": "west",
    "Ziegelhausen": "east",
}


class Address(BaseModel):
    """Street address of a venue, restricted to Heidelberg's suburbs."""

    street: str
    housenumber: str
    # suburb list from https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
    suburb: Literal[
        "Altstadt",
        "Bahnstadt",
        "Bergheim",
        "Boxberg",
        "Emmertsgrund",
        "Handschuhsheim",
        "Kirchheim",
        "Neuenheim",
        "Pfaffengrund",
        "Rohrbach",
        "Schlierbach",
        "Südstadt",
        "Weststadt",
        "Wieblingen",
        "Ziegelhausen",
    ]

    # Since all types that have an address have an area, we compute the area
    # on address level to minimize code duplication, although it technically
    # is no address property.
    @computed_field
    @property
    def area(self) -> Literal["center", "east", "north", "south", "west"]:
        """Return the coarse city area that ``suburb`` belongs to."""
        return _AREA_BY_SUBURB[self.suburb]


class Venue(BaseModel):
    """Fields shared by every entry of the knowledge base."""

    name: str
    description: str
    address: Address


class Hotel(Venue):
    """A hotel entry; feature flags are "yes"/"no"/"unknown" strings."""

    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
    air_conditioning: Literal["yes", "no", "unknown"]
    internet_access: Literal["yes", "no", "unknown"]
    parking: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    wheelchair: Literal["yes", "no", "unknown"]


class Restaurant(Venue):
    """A restaurant entry with dietary, seating and cuisine information."""

    phone: str
    website: str
    price: Literal["cheap", "moderate", "expensive"]
    internet_access: Literal["yes", "no", "unknown"]
    smoking: Literal["yes", "no", "unknown"]
    # The comma between "limited" and "unknown" matters: without it Python
    # concatenates the adjacent literals into the single value
    # "limitedunknown" and both real values are rejected.
    wheelchair: Literal["yes", "no", "limited", "unknown"]
    bar: Literal["yes", "no", "unknown"]
    delivery: Literal["yes", "no", "unknown"]
    vegan: Literal["yes", "no", "limited", "unknown"]
    vegetarian: Literal["yes", "no", "limited", "unknown"]
    indoor_seating: Literal["yes", "no", "limited", "unknown"]
    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
    cuisine: List[Literal[
        "african",
        "american",
        "asian",
        "chinese",
        "german",
        "greek",
        "indian",
        "international",
        "italian",
        "japanese",
        "korean",
        "oriental",
        "sushi",
        "thai",
    ]]


class Attraction(Venue):
    """An attraction entry tagged with one or more attraction types."""

    attraction_type: List[Literal[
        "zoo",
        "architecture",
        "view",
        "historic",
    ]]


#class Museum(Attraction):
#    theme: str

#class Theatre(Attraction):
#    program: List[Literal[
#        "Monday",
#        "Tuesday",
#        "Wednesday",
#        "Thursday",
#        "Friday",
#        "Saturday",
#        "Sunday",
#    ]]

#class TouristAttraction(Attraction):
#    view: Literal["yes", "no", "unknown"]

# %% [markdown]
# We import our data from `osm_export.geojson` and split it into
# restaurants, hotels and attractions for further processing. We use three
# output files per type so the data can easily be divided among three manual
# annotators.
# %%
import json
from collections import defaultdict

restaurants = []
hotels = []
attractions = []

# Load data and split into types. The encoding is given explicitly so the
# export is decoded as UTF-8 regardless of the platform's default encoding.
with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
    data = json.load(file)

# NOTE(review): `k` is not used anywhere in this cell; it is kept in case a
# later cell references it -- confirm and remove if not.
k = defaultdict(int)

for node in data["features"]:
    node = node["properties"]
    if node.get("amenity") == "restaurant":
        restaurants.append(node)
    tourism = node.get("tourism")
    if tourism == "hotel":
        hotels.append(node)
    if tourism == "attraction":
        attractions.append(node)


def write_data(label: str, data: list):
    """
    Clean a list of data points of one type and write them to three files.

    For every data point, properties that are not listed in `data_keys` are
    dropped, alias keys (`diet:*`, `contact:*`) are renamed to their plain
    key, and every plain key that is still missing is filled in: `type` with
    the singular form of `label`, `edited` with `False`, anything else with
    `"unknown"`. The cleaned points are distributed round-robin over
    `../data/osm/{label}_1.json`, `_2.json` and `_3.json`, each with its keys
    sorted alphabetically.

    The dicts in `data` are not modified; each point is cleaned on a copy so
    a node that appears in several type lists is not altered by an earlier
    call.

    @input label: str   plural type name, also used as output file prefix
    @input data: list   list of property dicts
    @output void
    """
    # Define possible labels
    data_keys = [
        "name",
        "description",
        "addr:street",
        "addr:housenumber",
        "addr:suburb",
        "phone",
        "contact:phone",
        "website",
        "contact:website",
        "internet_access",
        "smoking",
        "wheelchair",
        "delivery",
        "vegan",
        "diet:vegan",
        "vegetarian",
        "diet:vegetarian",
        "indoor_seating",
        "outdoor_seating",
        "cuisine",
        "price",
        "type",
        "edited",
    ]
    # Alias key -> plain key it is merged into. When both are present the
    # alias value wins, as it already did for the diet:* keys.
    aliases = {
        "diet:vegan": "vegan",
        "diet:vegetarian": "vegetarian",
        "contact:phone": "phone",
        "contact:website": "website",
    }
    # Only one trailing "s" is removed to build the singular type name.
    type_name = label[:-1] if label.endswith("s") else label

    # Prepare to split output data to 3 files
    splits = [[], [], []]

    for index, data_point in enumerate(data):
        # Filter out the unwanted properties (on a copy, see docstring)
        cleaned = {
            key: value
            for key, value in data_point.items()
            if key in data_keys
        }

        # Homogenize labels
        for alias, plain in aliases.items():
            if alias in cleaned:
                cleaned[plain] = cleaned.pop(alias)

        # Put an "unknown" marker inside empty properties
        for key in data_keys:
            if key in cleaned or key in aliases:
                continue
            if key == "type":
                cleaned["type"] = type_name
            elif key == "edited":
                cleaned["edited"] = False
            else:
                cleaned[key] = "unknown"

        splits[index % 3].append(dict(sorted(cleaned.items())))

    for number, split in enumerate(splits, start=1):
        # ensure_ascii=False emits non-ASCII characters verbatim, so the
        # file must be opened with an explicit UTF-8 encoding.
        path = f"../data/osm/{label}_{number}.json"
        with open(path, "w", encoding="utf-8") as output:
            json.dump({"data": split}, output, indent=2, ensure_ascii=False)


# Write preprocessed data to output files
write_data("hotels", hotels)
write_data("restaurants", restaurants)
write_data("attractions", attractions)