diff --git a/docs/v3/contributing.mdx b/docs/v3/contributing.mdx
index 3babc20ae..989749e77 100644
--- a/docs/v3/contributing.mdx
+++ b/docs/v3/contributing.mdx
@@ -64,7 +64,7 @@ make spell_fix
 We use `pytest` to test our code. You can run the tests by running the following command:
 
 ```bash
-make tests
+make test_all
 ```
 
 Make sure that all tests pass before submitting a pull request.
diff --git a/docs/v3/semantic-layer.mdx b/docs/v3/semantic-layer.mdx
index dfebda560..6d1582572 100644
--- a/docs/v3/semantic-layer.mdx
+++ b/docs/v3/semantic-layer.mdx
@@ -27,20 +27,22 @@ df.save(
     path="company/sales-data",  # Format: "organization/dataset"
     name="sales-data",  # Human-readable name
     description="Sales data from our retail stores",  # Optional description
-    columns={
-        "transaction_id": {
+    columns=[
+        {
+            "name": "transaction_id",
             "type": "string",
             "description": "Unique identifier for each sale"
         },
-        "sale_date": {
+        {
+            "name": "sale_date",
             "type": "datetime",
             "description": "Date and time of the sale"
         }
-    }
+    ]
 )
 ```
 
-#### name 
+#### name
 
 The name field identifies your dataset in the save method.
@@ -84,28 +86,33 @@ df.save(
     path="company/sales-data",
     name="sales-data",
     description="Daily sales transactions from all retail stores",
-    columns={
-        "transaction_id": {
+    columns=[
+        {
+            "name": "transaction_id",
             "type": "string",
             "description": "Unique identifier for each sale"
         },
-        "sale_date": {
+        {
+            "name": "sale_date",
             "type": "datetime",
             "description": "Date and time of the sale"
         },
-        "quantity": {
+        {
+            "name": "quantity",
             "type": "integer",
             "description": "Number of units sold"
         },
-        "price": {
+        {
+            "name": "price",
             "type": "float",
             "description": "Price per unit in USD"
         },
-        "is_online": {
+        {
+            "name": "is_online",
             "type": "boolean",
             "description": "Whether the sale was made online"
         }
-    }
+    ]
 )
 ```
@@ -238,7 +245,7 @@ source:
 - `connection_string` (str): Connection string for the data source
 - `query` (str): Query to retrieve data from the data source
 
-
+{/* commented out, as destination and update_frequency will only apply in the materialized case
 #### destination (mandatory)
 
 Specify the destination for your dataset.
@@ -256,6 +263,7 @@ destination:
   path: /path/to/data
 ```
 
+
 #### update_frequency
 
 Specify the frequency of updates for your dataset.
@@ -268,6 +276,7 @@ Specify the frequency of updates for your dataset.
 ```yaml
 update_frequency: daily
 ```
+*/}
 
 #### order_by
 
 Specify the columns to order by.
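The `semantic-layer.mdx` hunks above change the documented `columns` argument from a dict keyed by column name to a list of dicts, each carrying its own `"name"` key. A minimal sketch of the new call shape, assuming the v3 `pai.DataFrame` wrapper used elsewhere in these docs and some hypothetical sample data:

```python
import pandasai as pai

# Hypothetical sample data, for illustration only.
df = pai.DataFrame({
    "transaction_id": ["t-001", "t-002"],
    "sale_date": ["2024-01-05 09:30", "2024-01-06 14:10"],
})

# New-style schema: `columns` is a list of dicts, each with its own "name" key.
df.save(
    path="company/sales-data",  # Format: "organization/dataset"
    name="sales-data",
    description="Sales data from our retail stores",
    columns=[
        {"name": "transaction_id", "type": "string", "description": "Unique identifier for each sale"},
        {"name": "sale_date", "type": "datetime", "description": "Date and time of the sale"},
    ],
)
```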
diff --git a/pandasai/data_loader/loader.py b/pandasai/data_loader/loader.py
index 2b0dd00dc..a30e7d12e 100644
--- a/pandasai/data_loader/loader.py
+++ b/pandasai/data_loader/loader.py
@@ -72,7 +72,7 @@ def _get_abs_dataset_path(self):
         return os.path.join(find_project_root(), "datasets", self.dataset_path)
 
     def _load_schema(self):
-        schema_path = os.path.join(self._get_abs_dataset_path(), "schema.yaml")
+        schema_path = os.path.join(str(self._get_abs_dataset_path()), "schema.yaml")
         if not os.path.exists(schema_path):
             raise FileNotFoundError(f"Schema file not found: {schema_path}")
diff --git a/pandasai/data_loader/semantic_layer_schema.py b/pandasai/data_loader/semantic_layer_schema.py
index 8adbf4a7a..b5c922188 100644
--- a/pandasai/data_loader/semantic_layer_schema.py
+++ b/pandasai/data_loader/semantic_layer_schema.py
@@ -1,5 +1,5 @@
 import json
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import yaml
 from pydantic import (
@@ -19,21 +19,23 @@
 class Column(BaseModel):
     name: str = Field(..., description="Name of the column.")
-    type: str = Field(..., description="Data type of the column.")
+    type: Optional[str] = Field(None, description="Data type of the column.")
     description: Optional[str] = Field(None, description="Description of the column")
 
     @field_validator("type")
     @classmethod
     def is_column_type_supported(cls, type: str) -> str:
         if type not in VALID_COLUMN_TYPES:
-            raise ValueError(f"Unsupported column type: {type}")
+            raise ValueError(
+                f"Unsupported column type: {type}. Supported types are: {VALID_COLUMN_TYPES}"
+            )
         return type
 
 
 class Transformation(BaseModel):
     type: str = Field(..., description="Type of transformation to be applied.")
-    params: Dict[str, str] = Field(
-        ..., description="Parameters for the transformation."
+    params: Optional[Dict[str, str]] = Field(
+        None, description="Parameters for the transformation."
     )
@@ -95,13 +97,13 @@ def is_format_supported(cls, format: str) -> str:
 
 class SemanticLayerSchema(BaseModel):
     name: str = Field(..., description="Dataset name.")
+    source: Source = Field(..., description="Data source for your dataset.")
     description: Optional[str] = Field(
         None, description="Dataset’s contents and purpose description."
     )
     columns: Optional[List[Column]] = Field(
         None, description="Structure and metadata of your dataset’s columns"
     )
-    source: Source = Field(..., description="Data source for your dataset.")
     order_by: Optional[List[str]] = Field(
         None, description="Ordering criteria for the dataset."
     )
@@ -118,8 +120,11 @@ class SemanticLayerSchema(BaseModel):
         None, description="Frequency of dataset updates."
     )
 
+    def to_dict(self) -> dict[str, Any]:
+        return self.model_dump(exclude_none=True)
+
     def to_yaml(self) -> str:
-        return yaml.dump(self.model_dump(), sort_keys=False)
+        return yaml.dump(self.to_dict(), sort_keys=False)
 
 
 def is_schema_source_same(
diff --git a/pandasai/dataframe/base.py b/pandasai/dataframe/base.py
index 8c1058393..d90a3b786 100644
--- a/pandasai/dataframe/base.py
+++ b/pandasai/dataframe/base.py
@@ -176,22 +176,23 @@ def _create_yml_template(
         columns_dict: dictionary with info about columns of the dataframe
         """
-        columns = list(map(lambda column: Column(**column), columns_dict))
+        if columns_dict:
+            columns_dict = list(map(lambda column: Column(**column), columns_dict))
 
         schema = SemanticLayerSchema(
             name=name,
             description=description,
-            columns=columns,
+            columns=columns_dict,
             source=Source(type="parquet", path="data.parquet"),
             destination=Destination(
                 type="local", format="parquet", path="data.parquet"
             ),
         )
 
-        return schema.model_dump()
+        return schema.to_dict()
 
     def save(
-        self, path: str, name: str, description: str = None, columns: List[dict] = []
+        self, path: str, name: str, description: str = None, columns: List[dict] = None
     ):
         self.name = name
         self.description = description
diff --git a/tests/unit_tests/dataframe/test_dataframe.py b/tests/unit_tests/dataframe/test_dataframe.py
index 10be08a5a..5989ba4b3 100644
--- a/tests/unit_tests/dataframe/test_dataframe.py
+++ b/tests/unit_tests/dataframe/test_dataframe.py
@@ -112,24 +112,18 @@ def test_save_creates_correct_schema(self, sample_df):
             "name": name,
             "description": description,
             "columns": [
-                {"name": "Name", "type": "string", "description": None},
-                {"name": "Age", "type": "integer", "description": None},
+                {"name": "Name", "type": "string"},
+                {"name": "Age", "type": "integer"},
             ],
             "destination": {
                 "format": "parquet",
                 "path": "data.parquet",
                 "type": "local",
             },
-            "limit": None,
-            "order_by": None,
             "source": {
-                "connection": None,
                 "path": "data.parquet",
-                "table": None,
                 "type": "parquet",
             },
-            "transformations": None,
-            "update_frequency": None,
         }
 
         mock_yaml_dump.assert_called_once_with(
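Taken together, the schema changes make `Column.type` optional and route serialization through the new `to_dict()`, which wraps `model_dump(exclude_none=True)` so unset optional fields disappear from the saved output, which is exactly what the trimmed expected dict in `test_save_creates_correct_schema` now asserts. A quick sketch of the round trip, assuming `Column`, `Source`, and `Destination` are all importable from `semantic_layer_schema` as the validator context in the diff suggests:

```python
from pandasai.data_loader.semantic_layer_schema import (
    Column,
    Destination,
    SemanticLayerSchema,
    Source,
)

schema = SemanticLayerSchema(
    name="sales-data",
    source=Source(type="parquet", path="data.parquet"),
    destination=Destination(type="local", format="parquet", path="data.parquet"),
    columns=[
        Column(name="transaction_id", type="string"),
        Column(name="sale_date"),  # `type` is Optional now, so it may be omitted
    ],
)

# to_dict() delegates to model_dump(exclude_none=True): unset keys such as
# "description", "order_by", or "update_frequency" are dropped entirely.
print(schema.to_dict())

# to_yaml() serializes the same pruned dict, preserving field order
# (sort_keys=False), so the written schema.yaml stays free of null noise.
print(schema.to_yaml())
```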