import glob
import json  # needed by the object-spilling config sketched further down

import ray
from ray.air.config import RunConfig, ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from xgboost_ray import RayDMatrix, RayFileType

# We can also pass an explicit, sorted list of parquet files instead of a directory:
# path = list(sorted(glob.glob("/root/aws-samples-for-ray/glue/data.parquet")))

# These column names are passed through to `pd.read_parquet()` so that only the
# fields we need are read.
columns = [
    "passenger_count",
    "trip_distance",
    # "pickup_longitude",   # these are part of some years but not others
    # "pickup_latitude",
    # "dropoff_longitude",
    # "dropoff_latitude",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "total_amount",
]
# Alternative: read all columns and drop the unused ones after the fact, e.g.
# .drop_columns(cols=["vendor_id", "pickup_at", "dropoff_at", "rate_code_id", "store_and_fwd_flag", "payment_type"]) \

# Filter out any rows missing the total_amount column. (The other option is to
# pass a pyarrow filter expression to read_parquet, as sketched here:)
# import pyarrow.dataset as pads
# filter_expr = (pads.field("total_amount") >= 0.0)
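# A small alternative sketch (not in the original): apply the filter after the read
# with Ray Data's Dataset.filter(); the exact predicate is an illustrative assumption.
# dataset = dataset.filter(
#     lambda row: row["total_amount"] is not None and row["total_amount"] >= 0.0
# )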
# Optional: configure Ray to spill objects to S3 instead of local disk
# (this is what the `json` import above is for):
# ray.init(
#     _system_config={
#         "max_io_workers": 4,  # more IO workers for remote storage
#         "min_spilling_size": 100 * 1024 * 1024,  # spill at least 100MB at a time
#         "object_spilling_config": json.dumps(
#             {
#                 "type": "smart_open",
#                 "params": {
#                     "uri": "s3://dsoaws/ray-glue-xgboost-nyc-taxi-spill"
#                 },
#                 "buffer_size": 100 * 1024 * 1024,  # use a 100MB buffer for writes
#             },
#         ),
#     },
# )
# Spark sanity checks of the per-year prefixes in the Databricks copy of the data:
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2019/")
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2018/")
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2017/")
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2016/")
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2015/")
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2014/")
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2013/")
# [SUCCEEDED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2012/")
# The following years failed to load; I removed their directories from s3://dsoaws-databricks/...
# [FAILED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2011/")
# [FAILED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2010/")
# [FAILED] df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/2009/")
# Reading the whole bucket at once:
# df = spark.read.option("recursiveFileLookup", "true").parquet("s3://dsoaws-databricks/nyc-taxi/")
dataset = ray.data.read_parquet(
    [
        "s3://dsoaws/nyc-taxi/ride-metadata/year=2012/",
    ],
    columns=columns,
    # filter=filter_expr,  # this worked briefly, then stopped working for some reason
)

dataset.count()
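# A minimal sketch (not part of the original) of feeding this dataset into the
# XGBoostTrainer imported above; the label column, params, and worker count are
# illustrative assumptions, not values from the original.
# trainer = XGBoostTrainer(
#     scaling_config=ScalingConfig(num_workers=4, use_gpu=False),
#     run_config=RunConfig(name="xgboost-nyc-taxi"),
#     label_column="total_amount",
#     params={"objective": "reg:squarederror", "tree_method": "approx"},
#     datasets={"train": dataset},
# )
# result = trainer.fit()
# print(result.metrics)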