|
|
|
import os
|
|
|
|
|
|
|
|
sep = os.sep  # platform path separator; also used by process() below

# toggles
dataset = 'electricity'  # [covtype, electricity, mixed, phishing, power]
# When True, the runner at the bottom of the file fans files out to a
# multiprocessing.Pool (despite the "thread" in the name, these are
# worker *processes*); when False every file is processed sequentially.
use_threadpool = False  # True

# environments
input_prefix = f'data{sep}{dataset}_orig'   # raw space-separated input files
output_prefix = f'data{sep}{dataset}'       # converted CSV-style output files
sep_field = b','     # separates the feature block from the trailing label
sep_subfield = b';'  # separates individual features within the feature block

lst_files = os.listdir(input_prefix)
# NOTE(review): os.listdir order is OS-dependent. Per-file output does not
# depend on processing order, but sort lst_files here if deterministic
# ordering is ever needed.

# Idiomatic replacement for try/os.mkdir/except FileExistsError: pass.
os.makedirs(output_prefix, exist_ok=True)
|
|
|
|
|
|
|
|
def process(f: str, in_dir: str = None, out_dir: str = None,
            field_sep: bytes = b',', subfield_sep: bytes = b';') -> None:
    """Convert one space-separated raw file into a CSV-style file.

    Reads ``in_dir/f`` as bytes, treats the last space-separated token of
    each line as the label and all preceding tokens as features, and writes
    ``out_dir/<stem>.csv`` with features joined by ``subfield_sep`` and the
    label appended after ``field_sep``.

    Parameters
    ----------
    f : input file name; its extension is replaced with ``.csv``
    in_dir : source directory; defaults to module-level ``input_prefix``
    out_dir : destination directory; defaults to module-level ``output_prefix``
    field_sep : byte separator between the feature block and the label
    subfield_sep : byte separator between individual features
    """
    src_dir = input_prefix if in_dir is None else in_dir
    dst_dir = output_prefix if out_dir is None else out_dir
    in_path = os.path.join(src_dir, f)
    # os.path.splitext handles extensions of any length; the original
    # ``f[:-3]`` slicing assumed exactly three characters after the dot.
    out_path = os.path.join(dst_dir, os.path.splitext(f)[0] + '.csv')

    with open(in_path, 'rb') as ifile:
        contents = ifile.read()

    with open(out_path, 'wb') as ofile:
        # The original wrote a leading blank line — presumably a header
        # placeholder for the downstream reader; preserved as-is.
        ofile.write(b'\n')
        for line in contents.splitlines():
            line = line.strip()
            if not line:
                # Robustness fix: a blank line previously produced a bogus
                # b',\n' record; skip it instead.
                continue
            fields = line.split(b' ')
            # all-but-last tokens are features, the last token is the label
            ofile.write(subfield_sep.join(fields[:-1])
                        + field_sep + fields[-1] + b'\n')
|
|
|
|
|
|
|
|
# Dispatch: process every listed file, either sequentially (default) or
# fanned out across 8 worker processes when use_threadpool is enabled.
if not use_threadpool:
    for fname in lst_files:
        process(fname)
elif __name__ == '__main__':
    # Imported lazily so the sequential path never pays for multiprocessing;
    # the __main__ guard is required for process-pool start methods that
    # re-import this module in the workers.
    from multiprocessing import Pool
    with Pool(8) as workers:
        workers.map(process, lst_files)
|