# ori-data-pipeline/Sakefile.yaml
---
#using sake instead of just because need file outputs for dependencies ...
#! P=process
#! I=inputs
#! O=output
#! C=csvsql --no-inference --snifflimit 0 --query
#
#skip for demo: demo dataset already has normalized names.
# txns-names:
# help: normalize account names and fix missing names
# (ignore) doc: |-
# a case statement with a when for each variant present in the input data,
# because the payee field varies (e.g. 'Bob Fischer', 'B. Fischer').
# Then also add missing names (some bank transactions don't have a payee)
# (ignore) WARNING: |-
# might need to check the input data over time, as new variants occur... Is there some kind of warning I can build in to see if there are any UNKNOWNS left over?
# also, the second query, 'fix missing names', sets all remaining UNKNOWNs to Bank. Is it possible to do that so there's remaining UNKNOWNs to detect?
# dependencies:
# - "$I/bank-transactions.csv"
# - "fix_missing_names.sql"
# - "query_normalize_names.sql" #not present in this demo dataset
# formula: >-
# $C "normalize_names.sql"
# --query "fix_missing_names.sql"
# < "$I/bank-transactions.csv"
# > $P/txns-names.csv
# output:
# - $P/txns-names.csv
txns-fromto:
  help: convert transactions to format 'from — amount — to'
  dependencies:
  # - $P/txns-names.csv
    - $I/bank-transactions.csv
  # Positive amounts become name -> shared rows, negative amounts become
  # shared -> name rows (with the sign flipped), so every row reads
  # from / amount / to.
  # NOTE(review): double-quoted "shared"/"from"/"to" mix identifiers and
  # string literals -- this relies on SQLite's double-quoted string-literal
  # fallback, which works under csvsql's in-memory SQLite but would break on
  # a standards-strict SQL engine. Confirm before porting.
  formula: >-
    $C 'select id, date, name as "from",
    amount,
    "shared" as "to", description
    from stdin where cast(amount as real) > 0
    union select id, date, "shared" as "from",
    -amount,
    name as "to", description
    from stdin where cast(amount as real) < 0
    order by id, date;'
    < $I/bank-transactions.csv
    > $P/txns-fromto.csv
  output:
    - $P/txns-fromto.csv
#skip for demo. In the real dataset, kWh-meters:users is many:one so the meter usages have to be grouped and summed.
#months added to output file manually.
# kwh-usage:
# help: everyone's usage per period, including kWh used and months active in period.
# (ignore) doc: |
# in the real dataset, some households use multiple kWh meters. We need one kWh usage figure per account,
# so we have to sum the amount, grouped by account.
# dependencies:
# - $I/kwh-meters.csv
# - $I/months.csv
# formula: >-
# $C 'with vbwsum as
# (select "end" - "start" as usage, period, account from meters),
# vb as(select period, account, cast(sum(usage) as int) as usage
# from vbwsum
# group by period, account)
# select vb.*, months.months from vb
# inner join months on vb.period = months.period
# and vb.account = months.account;'
# --tables meters,months
# $I/kwh-meters.csv
# $I/months.csv
# > $P/kwh-usage.csv
# output:
# - $P/kwh-usage.csv
bank-costs:
  help: extract totals for each supplier from initial transaction table.
  (ignore) doc: |
    This gets calculated so that it gets updated when bank transactions input file changes. Only Bank is actually used for now...
  dependencies:
    - $P/txns-fromto.csv
  # Sums all amounts paid to "Bank" into a single account/amount row.
  # Fixed: cast to TEXT instead of STRING -- in SQLite a declared type of
  # STRING has NUMERIC affinity (not TEXT), so the original
  # `cast(... as string)` silently converted the rounded value back to a
  # number instead of producing a string as intended.
  formula: >-
    $C 'select "to" as account, cast(round(sum(amount), 2) as text) as amount from stdin
    where "to" = "Bank"
    group by "to";'
    < $P/txns-fromto.csv
    > $P/bank-costs.csv
  output:
    - $P/bank-costs.csv
total-val-txns:
  help: reverse value transactions for electricity usage variable and fixed, and banking costs
  (ignore) doc: |
    Doing this calculation is the main premise of this whole pipeline.
    To figure out how much all the users owe or are owed by the shared account,
    we allocate the final costs of energy delivered (from the invoices) to each account
    based on their proportion of kWh used (that gets done in the next step, indiv-val-txns).
    And then in the step txns-with-generated, we add these as 'virtual' transactions in reverse, from the Energy company to the shared account,
    and then from the shared account to the individual users.
    If everyone's account is settled correctly, then in the final balance every account should be at zero.
  dependencies:
    - $I/collective-usage-expenses.csv
    - $P/bank-costs.csv
  # Generates three kinds of rows: variable electricity value and fixed
  # electricity value (one each per period+account from `elec` =
  # collective-usage-expenses.csv), plus one banking-costs row from
  # `sup` = bank-costs.csv.
  # NOTE(review): the bank row's date "2025-09-01" is hard-coded -- it will
  # need updating (or deriving from the data) when the dataset changes.
  # NOTE(review): double-quoted values ("total", "2025-09-01", "shared",
  # "Bank", ...) rely on SQLite's double-quoted string-literal fallback;
  # confirm before porting to another SQL engine.
  formula: >-
    $C '
    with bank as (
    select account || "total" as id, "2025-09-01" as date, account, amount as amount, "value of banking costs" as description
    from sup where account = "Bank"
    )
    select period || account || "valvar" as id,
    enddate as date,
    account as "from",
    exp_var as amount,
    "shared" as "to",
    "value of variable electricity expenses" as description
    from elec
    union all select period || account || "valfixed" as id,
    enddate as date,
    account as "from",
    exp_fixed as amount,
    "shared" as "to",
    "value of fixed electricity expenses" as description
    from elec
    union all select id, date, account, amount, "shared", description from bank
    order by id, date, account;'
    --tables elec,sup
    $I/collective-usage-expenses.csv
    $P/bank-costs.csv
    > $P/total-val-txns.csv
  output:
    - $P/total-val-txns.csv
#Javascript version: uses percent-divide.js
indiv-val-txns:
  help: create txns for value consumed by each household
  (ignore) doc: |
    this step gets done in Javascript because sqlite can't do accurate floating point division.
    We need to allocate a proportion of the total expenses to each user:
    - collective energy usage * proportion of kWh per user
    - banking costs * months the user was participating (Sarah joined in the last month of period 2).
    The math is still not 100% accurate, but it's good enough for the current purposes.
    (the errors are to the order of 10e-10).
    Possible improvement: use Big.js to do the arithmetic, though there might still be a final rounding error at the end.
  dependencies:
    - $I/collective-usage-expenses.csv
    - $P/bank-costs.csv
    - $I/kwh-usage.csv
    - percent-divide.js
  # NOTE(review): percent-divide.js presumably takes the three csv paths as
  # positional arguments and writes the result to stdout -- verify in the
  # script itself.
  formula: >-
    deno run -A percent-divide.js $I/collective-usage-expenses.csv $P/bank-costs.csv $I/kwh-usage.csv
    > $P/indiv-val-txns.csv
  output:
    - $P/indiv-val-txns.csv
#Web Origami version: uses data.ori
#indiv-val-txns:
# help: create txns for value consumed by each household
# (ignore) doc: |
# this step gets done in Javascript because sqlite can't do accurate floating point division.
# We need to allocate a proportion of the total expenses to each user:
# - collective energy usage * proportion of kWh per user
# - banking costs * months the user was participating (Sarah joined in the last month of period 2).
# The math is still not 100% accurate, but it's good enough for the current purposes.
# (the errors are to the order of 10e-10).
# Possible improvement: use Big.js to do the arithmetic, though there might still be a final rounding error at the end.
# dependencies:
# - $I/collective-usage-expenses.csv
# - $P/bank-costs.csv
# - $I/kwh-usage.csv
# - data.ori
##there are more dependencies -- all the JS scripts used in data.ori -- but it doesn't make sense to include those all here.
# formula: >-
# ori 'Origami.csv data.ori/txns' > $P/indiv-val-txns.csv
# output:
# - $P/indiv-val-txns.csv
txns-with-other:
  help: manually add txns which were not in input bank transaction csv
  (ignore) doc: |
    we add this in the 'fromto' step.
  dependencies:
    - $P/txns-fromto.csv
    - $I/txns-other.csv
  # Concatenates the converted bank transactions with the hand-maintained
  # txns-other.csv. NOTE(review): `select * ... union all` requires both
  # csvs to have the same column layout -- presumably txns-other.csv is
  # already in the from/amount/to format; verify against the input file.
  formula: >-
    $C 'select * from "txns-fromto" union all
    select * from "txns-other"
    order by date, id;'
    $P/txns-fromto.csv
    $I/txns-other.csv
    > $P/txns-with-other.csv
  output:
    - $P/txns-with-other.csv
txns-with-generated:
  help: add the generated value transactions.
  # Appends the generated 'virtual' value transactions (collective totals
  # from total-val-txns and per-household allocations from indiv-val-txns)
  # to the real transactions from txns-with-other.
  dependencies:
    - $P/txns-with-other.csv
    - $P/total-val-txns.csv
    - $P/indiv-val-txns.csv
  formula: >-
    $C 'select * from "txns-with-other"
    union all select * from "total-val-txns"
    union all select * from "indiv-val-txns"
    order by date, id;'
    $P/txns-with-other.csv
    $P/total-val-txns.csv
    $P/indiv-val-txns.csv
    > $P/txns-with-generated.csv
  output:
    - $P/txns-with-generated.csv
txns:
  help: expand transactions to two rows per transaction.
  (ignore) doc: By expanding the transactions to two entries, one for the 'from' account and one for the 'to' account, we can do double-entry accounting (lite)
  dependencies:
    - $P/txns-with-generated.csv
  # Each transaction becomes a debit row (-amount, charged to the "from"
  # account) and a credit row (+amount, credited to the "to" account), so
  # summing per account gives each account's balance.
  formula: >-
    $C 'select id, date,
    "from" as account,
    -amount as amount,
    description
    from stdin union all
    select id, date,
    "to" as account,
    amount,
    description
    from stdin
    order by date, id;'
    < $P/txns-with-generated.csv
    > $P/txns.csv
  output:
    - $P/txns.csv
balance:
  help: get total balance
  dependencies:
    - $P/txns.csv
  # Per-account balances (rounded to cents) plus one grand-total row.
  # NOTE(review): the `total` CTE sums every row and is not rounded, while
  # `pre` skips rows with a null id and rounds to 2 decimals -- confirm both
  # asymmetries are intentional.
  # Fixed: added the space after the output redirect (`> $O/...`) for
  # consistency with every other task in this file.
  formula: >-
    $C 'with total as (
    select "total" as account, sum(amount) as amount
    from stdin
    ), pre as (
    select account,
    round(sum(amount),2) as amount
    from stdin
    where id is not null
    group by account
    )
    select * from pre union all select * from total;'
    < $P/txns.csv
    > $O/balance.csv
  output:
    - $O/balance.csv