BlazingSQL

BlazingSQL Documentation

Welcome to our Documentation and Support Page!

BlazingSQL is a GPU accelerated SQL engine built on top of the RAPIDS data science framework. RAPIDS is a collection of open-source libraries for end-to-end data science pipelines entirely in the GPU. BlazingSQL extends RAPIDS and enables users to run SQL queries on Apache Arrow in GPU memory.

Please install, test, deploy, and gripe in our Discussion board.

Get Started    Discussions

File System

Connect BlazingSQL to distributed file systems.

Register External Filesystems

BlazingSQL is built for connecting data lakes to RAPIDS; as such, you can register external filesystems. We currently support the following:

  • Local
  • AWS S3
  • HDFS

Once you register a new filesystem, you only need to modify the file paths in the examples above to use the filesystem you registered.

Register Local Filesystem

import cudf as cudf
import pyblazing
from pyblazing import DriverType, FileSystemType, EncryptionType
from pyblazing import SchemaFrom

def register_localfs():
    """Register a POSIX filesystem under the authority "tpch".

    Prints a banner, calls ``pyblazing.register_file_system`` with the
    root set to "/", and prints whatever status that call returns.
    """
    print('*** Register a POSIX File System ***')
    # The status is printed directly; nothing is returned to the caller.
    print(
        pyblazing.register_file_system(
            authority="tpch",
            type=FileSystemType.POSIX,
            root="/",
        )
    )
    
# Register the filesystem first, before any table schema or query is set up.
register_localfs()

# Column names and dtypes handed to the schema registration below
# (presumably matched up by position -- confirm against the pyblazing docs).
names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment']
dtypes = ['int32', 'int64', 'int32', 'int64']

sql = 'select n_nationkey, n_regionkey + n_nationkey as addition from main.nation'

# Describe the pipe-delimited CSV file as the table "nation".
nation_schema = pyblazing.register_table_schema(
    table_name='nation',
    type=SchemaFrom.CsvFile,
    path='/tmp/tpch/1mb/nation.psv',
    delimiter='|',
    dtypes=dtypes,
    names=names,
)

# Map the schema to the file(s) the query should read.
# NOTE(review): the schema is registered from '/tmp/tpch/1mb/nation.psv' while
# the query reads 'file://tpch/1mb/nation.psv' -- confirm both resolve to the
# same file under the "tpch" authority.
table_data = {nation_schema: ['file://tpch/1mb/nation.psv']}

result_gdf = pyblazing.run_query_filesystem(sql, table_data)
print(sql)
print(result_gdf)

Register AWS S3

import cudf as cudf
import pyblazing
from pyblazing import DriverType, FileSystemType, EncryptionType
from pyblazing import SchemaFrom

def register_s3():
    """Register an S3 filesystem under the authority "tpch_s3".

    Prints a banner, calls ``pyblazing.register_file_system`` with the
    bucket and credential settings below, and prints the returned status.
    """
    print('*** Register an S3 File System ***')
    # Connection settings for the bucket. The credential values are the
    # literals from this example (presumably placeholders -- substitute
    # your own before running).
    s3_settings = {
        "bucketName": "s3_bucket",
        "encryptionType": EncryptionType.NONE,
        "kmsKeyAmazonResourceName": "",
        "accessKeyId": "accessKeyIddsf3",
        "secretKey": "secretKey234",
        "sessionToken": "",
    }
    registration_status = pyblazing.register_file_system(
        authority="tpch_s3",
        type=FileSystemType.S3,
        root="/",
        params=s3_settings,
    )
    print(registration_status)
    
# Register the S3 filesystem before any s3:// path is used.
register_s3()

# Column names and dtypes handed to the schema registration below
# (presumably matched up by position -- confirm against the pyblazing docs).
names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment']
dtypes = ['int32', 'int64', 'int32', 'int64']

sql = 'select n_nationkey, n_regionkey + n_nationkey as addition from main.nation'

# Describe the pipe-delimited CSV file as the table "nation"; the path
# uses the "tpch_s3" authority registered by register_s3().
nation_schema = pyblazing.register_table_schema(
    table_name='nation',
    type=SchemaFrom.CsvFile,
    path='s3://tpch_s3/Data1Mb/nation_0_0.psv',
    delimiter='|',
    dtypes=dtypes,
    names=names,
)

# Map the schema to the file(s) the query should read.
table_data = {nation_schema: ['s3://tpch_s3/Data1Mb/nation_0_0.psv']}

result_gdf = pyblazing.run_query_filesystem(sql, table_data)
print(sql)
print(result_gdf)

Register HDFS

import cudf as cudf
import pyblazing
from pyblazing import DriverType, FileSystemType, EncryptionType
from pyblazing import SchemaFrom

def register_hdfs():
    """Register an HDFS filesystem under the authority "tpch_hdfs".

    Prints a banner, calls ``pyblazing.register_file_system`` with the
    host, port, user and driver settings below, and prints the returned
    status.
    """
    print('*** Register a HDFS File System ***')
    # Connection settings for the HDFS endpoint used by this example.
    hdfs_settings = dict(
        host="127.0.0.1",
        port=54310,
        user="hadoop",
        driverType=DriverType.LIBHDFS3,
        kerberosTicket="",
    )
    outcome = pyblazing.register_file_system(
        authority="tpch_hdfs",
        type=FileSystemType.HDFS,
        root="/",
        params=hdfs_settings,
    )
    print(outcome)
    
# Register the HDFS filesystem before any hdfs:// path is used.
register_hdfs()

# Column names and dtypes handed to the schema registration below
# (presumably matched up by position -- confirm against the pyblazing docs).
names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment']
dtypes = ['int32', 'int64', 'int32', 'int64']

sql = 'select n_nationkey, n_regionkey + n_nationkey as addition from main.nation'

# Describe the pipe-delimited CSV file as the table "nation"; the path
# uses the "tpch_hdfs" authority registered by register_hdfs().
nation_schema = pyblazing.register_table_schema(
    table_name='nation',
    type=SchemaFrom.CsvFile,
    path='hdfs://tpch_hdfs/Data1Mb/nation_0_0.psv',
    delimiter='|',
    dtypes=dtypes,
    names=names,
)

# Map the schema to the file(s) the query should read.
table_data = {nation_schema: ['hdfs://tpch_hdfs/Data1Mb/nation_0_0.psv']}

result_gdf = pyblazing.run_query_filesystem(sql, table_data)
print(sql)
print(result_gdf)

What's Next

Now learn how to query different files on these file systems.

Data Definition Language (DDL)