Code
library(tidyverse)
library(openxlsx)
library(readxl)
Tony Duan
Data input and output in R
# A tibble: 2 × 2
a b
<dbl> <chr>
1 1241 rhth
2 35235 rjyyj
read CSV online
# A tibble: 6 × 32
hotel is_canceled lead_time arrival_date_year arrival_date_month
<chr> <dbl> <dbl> <dbl> <chr>
1 Resort Hotel 0 342 2015 July
2 Resort Hotel 0 737 2015 July
3 Resort Hotel 0 7 2015 July
4 Resort Hotel 0 13 2015 July
5 Resort Hotel 0 14 2015 July
6 Resort Hotel 0 14 2015 July
# ℹ 27 more variables: arrival_date_week_number <dbl>,
# arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
# stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
# meal <chr>, country <chr>, market_segment <chr>,
# distribution_channel <chr>, is_repeated_guest <dbl>,
# previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
# reserved_room_type <chr>, assigned_room_type <chr>, …
The arrow package must be installed this way; otherwise it will report an error. ::: {.cell}
:::
Arrow package version: 19.0.1
Capabilities:
acero TRUE
dataset TRUE
substrait FALSE
parquet TRUE
json TRUE
s3 TRUE
gcs FALSE
utf8proc TRUE
re2 TRUE
snappy TRUE
gzip TRUE
brotli FALSE
zstd TRUE
lz4 TRUE
lz4_frame TRUE
lzo FALSE
bz2 FALSE
jemalloc FALSE
mimalloc TRUE
Memory:
Allocator mimalloc
Current 0 bytes
Max 0 bytes
Runtime:
SIMD Level none
Detected SIMD Level none
Build:
C++ Library Version 19.0.1
C++ Compiler AppleClang
C++ Compiler Version 14.0.0.14000029
# A tibble: 6 × 62
FlightDate Airline Origin Dest Cancelled Diverted CRSDepTime DepTime
<dttm> <chr> <chr> <chr> <lgl> <lgl> <int> <dbl>
1 2022-04-04 00:00:00 Commut… GJT DEN FALSE FALSE 1133 1123
2 2022-04-04 00:00:00 Commut… HRL IAH FALSE FALSE 732 728
3 2022-04-04 00:00:00 Commut… DRO DEN FALSE FALSE 1529 1514
4 2022-04-04 00:00:00 Commut… IAH GPT FALSE FALSE 1435 1430
5 2022-04-04 00:00:00 Commut… DRO DEN FALSE FALSE 1135 1135
6 2022-04-04 00:00:00 Commut… DEN TUL FALSE FALSE 955 952
# ℹ 54 more variables: DepDelayMinutes <dbl>, DepDelay <dbl>, ArrTime <dbl>,
# ArrDelayMinutes <dbl>, AirTime <dbl>, CRSElapsedTime <dbl>,
# ActualElapsedTime <dbl>, Distance <dbl>, Year <int>, Quarter <int>,
# Month <int>, DayofMonth <int>, DayOfWeek <int>,
# Marketing_Airline_Network <chr>,
# Operated_or_Branded_Code_Share_Partners <chr>,
# DOT_ID_Marketing_Airline <int>, IATA_Code_Marketing_Airline <chr>, …
Read a gzip-compressed parquet file ::: {.cell}
# A tibble: 6 × 62
FlightDate Airline Origin Dest Cancelled Diverted CRSDepTime DepTime
<dttm> <chr> <chr> <chr> <lgl> <lgl> <int> <dbl>
1 2022-04-04 00:00:00 Commut… GJT DEN FALSE FALSE 1133 1123
2 2022-04-04 00:00:00 Commut… HRL IAH FALSE FALSE 732 728
3 2022-04-04 00:00:00 Commut… DRO DEN FALSE FALSE 1529 1514
4 2022-04-04 00:00:00 Commut… IAH GPT FALSE FALSE 1435 1430
5 2022-04-04 00:00:00 Commut… DRO DEN FALSE FALSE 1135 1135
6 2022-04-04 00:00:00 Commut… DEN TUL FALSE FALSE 955 952
# ℹ 54 more variables: DepDelayMinutes <dbl>, DepDelay <dbl>, ArrTime <dbl>,
# ArrDelayMinutes <dbl>, AirTime <dbl>, CRSElapsedTime <dbl>,
# ActualElapsedTime <dbl>, Distance <dbl>, Year <int>, Quarter <int>,
# Month <int>, DayofMonth <int>, DayOfWeek <int>,
# Marketing_Airline_Network <chr>,
# Operated_or_Branded_Code_Share_Partners <chr>,
# DOT_ID_Marketing_Airline <int>, IATA_Code_Marketing_Airline <chr>, …
:::
# A tibble: 6 × 62
FlightDate Airline Origin Dest Cancelled Diverted CRSDepTime DepTime
<dttm> <chr> <chr> <chr> <lgl> <lgl> <int> <dbl>
1 2022-04-04 00:00:00 Commut… GJT DEN FALSE FALSE 1133 1123
2 2022-04-04 00:00:00 Commut… HRL IAH FALSE FALSE 732 728
3 2022-04-04 00:00:00 Commut… DRO DEN FALSE FALSE 1529 1514
4 2022-04-04 00:00:00 Commut… IAH GPT FALSE FALSE 1435 1430
5 2022-04-04 00:00:00 Commut… DRO DEN FALSE FALSE 1135 1135
6 2022-04-04 00:00:00 Commut… DEN TUL FALSE FALSE 955 952
# ℹ 54 more variables: DepDelayMinutes <dbl>, DepDelay <dbl>, ArrTime <dbl>,
# ArrDelayMinutes <dbl>, AirTime <dbl>, CRSElapsedTime <dbl>,
# ActualElapsedTime <dbl>, Distance <dbl>, Year <int>, Quarter <int>,
# Month <int>, DayofMonth <int>, DayOfWeek <int>,
# Marketing_Airline_Network <chr>,
# Operated_or_Branded_Code_Share_Partners <chr>,
# DOT_ID_Marketing_Airline <int>, IATA_Code_Marketing_Airline <chr>, …
Rows: 1
Columns: 11
$ glossary.title <chr> "ex…
$ glossary.GlossDiv.title <chr> "S"
$ glossary.GlossDiv.GlossList.GlossEntry.ID <chr> "SG…
$ glossary.GlossDiv.GlossList.GlossEntry.SortAs <chr> "SG…
$ glossary.GlossDiv.GlossList.GlossEntry.GlossTerm <chr> "St…
$ glossary.GlossDiv.GlossList.GlossEntry.Acronym <chr> "SG…
$ glossary.GlossDiv.GlossList.GlossEntry.Abbrev <chr> "IS…
$ glossary.GlossDiv.GlossList.GlossEntry.GlossDef.para <chr> "A …
$ glossary.GlossDiv.GlossList.GlossEntry.GlossDef.GlossSeeAlso..GML. <chr> "GM…
$ glossary.GlossDiv.GlossList.GlossEntry.GlossDef.GlossSeeAlso..XML. <chr> "XM…
$ glossary.GlossDiv.GlossList.GlossEntry.GlossSee <chr> "ma…
WOODCASK = @WOODCASKSHOP
www.woodcask.shop
Singapore All prices are in 10ml pours.
Price valid date: 22-23/2/2025
1D Name PRICE/1OML
"R1054_—-CARONI21Y01997888R60.4% ~Sst~CS~=“C~S*‘“s*~*~*~*~*~*~*S~SSC<C*‘;*‘«wRROCO
R1066A Long Pond 2005 stc<h>e ester 550/700 cambridge 62.5% ¥ 50.00
"R1063___ LONG POND 2005 HABITATION VELIER 14 YOEASTER1285.4%62% +¥ ~~ 60.00.
R1030 CARONI 23 YO 1994-2017 DOUBLE MATURATION GUYANA STOCK 57.18% ¥ 150.00
R1080 CARONI 15 YO SINGLE CASK#3767 LMDW 2000 -2015 68.4% ¥ 400.00
R1081 Long Pond Plantation Single Cask 1993 CRV 15 YO The Nectar 70cl 76,8% ¥ 50.00
R1073 New Yarmouth Jamaica 1994 26 YO Year Old Blackadder 66.8% ¥ 50.00
R1046 CARONI 17 YO VELIER EXTRA STRONG 110 PROOF ¥ 50.00
R1070 CARONI 23 YO 1997 SPHERIC SPIRITS 1ST FILL BOURBON BARREL#1143 62% ¥ 70.00
Hw s1005 GLEN GRANT 31yo 1970 2001 70cl 45% SAMAROLI Sherry Cask 1025 ¥ 200.00
WS1395 Longrow 1987 "The Last Bottling" Bottled 2007 45% CASK #116 ¥ 100.00
ws1402 Glen Scotia 1977 Prestonfield House 33 YO LMDW & The Nectar 49.1% ¥ 100.00
ws1304 SPRINGBANK 21 YO SINGLE CASK NEDERLAND OLOROSO SHERRY HOGSHEAD 48% ¥ 150.00
WS1396 Springbank 1993 Moon Import 35th Anniversary / Lawless 45% ¥ 100.00
ws1391 Springbank 1985 Brae Dean Dtm Sherry Wood Moon Import 50% Barrel 40/1-4 ¥ 200.00
ws1210 CLYNELISH 1990-2022 BOURBON BARREL#3477 LMDW ARTIST12 47.7% ¥ 100.00
WS1450 TALISKER 12 YO The Golden Spirit - John Walker & Sons 1980S BOT 43% ¥ 100.00
ws1165 GLENURY 21 YO 1980 OMC 50% ¥ 100.00
ws1279 GLEN KEITH 20 YO LMDW ARTIST#3 BOURBON BARREL#120560 1992-2013 60.1% ¥ 60.00
WS1278 GLENCRAIG 35 YO LMDW ARTIST#3 BOURBON BARREL#4257 1976-2013 43.7% ¥ 100.00
“WS1240 Blair Athol 1976 Cadenhead's 200 1976-1997478% ~~~ ~+¥ ~~~~-80.00°
WS1233 BENROMACH 28 YO 1965-1993 CADENHEADS AUTHENTIC COLLECTION 47.6% ¥ 120.00
ws1311 SPRINGBANK 22 YO 1992 THE BOTTLERS REFILL SHERRY HOGSHEAD#227 51.9% ¥ 150.00
ws1449 GLENDRONACH 25 YO 1993 -2019 OLOROSO SHERRY CASK#416 51.2% ¥ 60.00
ws1162 MALALLAN 2002-2022 TWE SPEYMALT CASK#9545 55.5% ¥ 60.00
WS1375, MACALLAN 12 YO LIC UTIF 935 MILANO 1980S 43% ¥ 100.00
ws1358 MACALLAN 8 YEAR OLD SHERRY WOOD 1970S 75CL 43% ¥ 200.00
wS1205 MACALLAN 11 YO SMWS 24.57 1988 55.9% ¥ 120.00
ws1200 GLENFARCLAS 29 YO 1970 SHERRYWOOD Cadenhead bot 2000 54% ¥ 150.00
ws1341 GLENROTHES 27 YO 1979-2006 RED WINE CASK FINISHED OMC 50% ¥ 120.00
# Turn the raw OCR string into one row per recognised line, then clean it up.
raw_tibble001 <- tibble(data = unlist(str_split(text001, pattern = "\n"))) %>%
  # Skip the first five lines and drop lines shorter than two characters.
  slice(6:n()) %>%
  filter(str_length(data) >= 2) %>%
  mutate(
    # Keep the untouched OCR line next to the cleaned one.
    old = data,
    # Strip underscores, em dashes and both kinds of quote characters.
    data = data %>%
      str_replace_all("_", "") %>%
      str_replace_all("—", "") %>%
      str_replace_all("'", "") %>%
      str_replace_all('"', "")
  ) %>%
  # Text before the yen sign goes to `one`, text after it to `price`.
  separate(data, into = c("one", "price"), sep = "¥") %>%
  mutate(
    # `id` is the leading run of non-space characters plus the space after it;
    # `name` is `one` with that leading token removed.
    id = str_extract(one, "^\\S+ "),
    name = gsub("^\\S+ ", "", one)
  )
glimpse(raw_tibble001)
Rows: 30
Columns: 5
$ one <chr> "R1054-CARONI21Y01997888R60.4% ~Sst~CS~=“C~S*‘“s*~*~*~*~*~*~*S~…
$ price <chr> NA, " 50.00", " ~~ 60.00.", " 150.00", " 400.00", " 50.00", " 50…
$ old <chr> "\"R1054_—-CARONI21Y01997888R60.4% ~Sst~CS~=“C~S*‘“s*~*~*~*~*~*…
$ id <chr> "R1054-CARONI21Y01997888R60.4% ", "R1066A ", "R1063 ", "R1030 ",…
$ name <chr> " ~Sst~CS~=“C~S*‘“s*~*~*~*~*~*~*S~SSC<C*‘;*‘«wRROCO", "Long Pond…
Rda is just a short name for RData. Rds stores a single R object. RData can store multiple R objects in a single file.
$page
width height
8.5 11.0
$landscape
[1] FALSE
$margins
top bottom left right header footer
1.0000000 1.0000000 0.7875000 0.9201389 0.4916667 0.4916667
doc_index content_type
1 1 paragraph
2 2 paragraph
3 3 paragraph
4 4 paragraph
5 5 paragraph
6 6 paragraph
7 7 paragraph
8 8 paragraph
9 9 paragraph
10 10 paragraph
11 11 paragraph
12 12 paragraph
13 13 paragraph
14 14 paragraph
15 15 paragraph
16 16 paragraph
17 17 paragraph
18 18 paragraph
19 19 paragraph
20 20 paragraph
21 21 paragraph
22 22 paragraph
23 23 paragraph
24 24 paragraph
25 25 paragraph
26 26 paragraph
27 27 paragraph
28 28 paragraph
29 29 paragraph
30 30 paragraph
31 31 paragraph
32 32 paragraph
33 33 paragraph
34 34 paragraph
35 35 paragraph
36 36 paragraph
37 37 paragraph
38 38 paragraph
39 39 paragraph
40 40 paragraph
41 41 paragraph
42 42 paragraph
43 43 paragraph
44 44 paragraph
45 45 paragraph
46 46 paragraph
47 47 paragraph
48 48 paragraph
49 49 paragraph
50 50 paragraph
51 51 paragraph
52 52 paragraph
53 53 paragraph
54 54 paragraph
55 55 paragraph
56 56 paragraph
57 57 paragraph
58 58 paragraph
59 59 paragraph
60 60 paragraph
61 61 paragraph
62 62 paragraph
63 63 paragraph
64 64 paragraph
65 65 paragraph
66 66 paragraph
67 67 paragraph
68 68 paragraph
1.1 69 table cell
1.6 69 table cell
2.2 69 table cell
2.7 69 table cell
3.3 69 table cell
3.8 69 table cell
4.4 69 table cell
4.9 69 table cell
5.5 69 table cell
5.10 69 table cell
110 70 paragraph
111 71 paragraph
112 72 paragraph
113 73 paragraph
114 74 paragraph
115 75 paragraph
116 76 paragraph
117 77 paragraph
118 78 paragraph
119 79 paragraph
120 80 paragraph
121 81 paragraph
122 82 paragraph
123 83 paragraph
124 84 paragraph
125 85 paragraph
126 86 paragraph
127 87 paragraph
128 88 paragraph
129 89 paragraph
style_name
1 heading 1
2 <NA>
3 heading 1
4 <NA>
5 <NA>
6 <NA>
7 <NA>
8 <NA>
9 <NA>
10 <NA>
11 Style Heading 2 + Not Italic Before: 0 pt After: 0 pt Line spa...
12 <NA>
13 <NA>
14 <NA>
15 <NA>
16 <NA>
17 <NA>
18 <NA>
19 heading 3
20 <NA>
21 <NA>
22 <NA>
23 <NA>
24 <NA>
25 <NA>
26 <NA>
27 <NA>
28 <NA>
29 <NA>
30 <NA>
31 <NA>
32 <NA>
33 <NA>
34 <NA>
35 <NA>
36 <NA>
37 <NA>
38 <NA>
39 heading 2
40 <NA>
41 <NA>
42 <NA>
43 <NA>
44 <NA>
45 <NA>
46 heading 2
47 <NA>
48 heading 2
49 heading 2
50 <NA>
51 <NA>
52 <NA>
53 <NA>
54 <NA>
55 <NA>
56 heading 2
57 heading 2
58 <NA>
59 <NA>
60 heading 2
61 <NA>
62 <NA>
63 <NA>
64 <NA>
65 <NA>
66 <NA>
67 <NA>
68 <NA>
1.1 <NA>
1.6 <NA>
2.2 <NA>
2.7 <NA>
3.3 <NA>
3.8 <NA>
4.4 <NA>
4.9 <NA>
5.5 <NA>
5.10 <NA>
110 heading 2
111 heading 2
112 <NA>
113 <NA>
114 <NA>
115 <NA>
116 <NA>
117 <NA>
118 Style Heading 2 + Not Italic
119 Style Heading 2 + Not Italic
120 Style Heading 2 + Not Italic
121 <NA>
122 <NA>
123 <NA>
124 <NA>
125 <NA>
126 <NA>
127 heading 2
128 <NA>
129 <NA>
text
1 Word Documents Template
2
3 Main heading:
4 Use the Heading 1 style for primary headings so that screen readers can identify them as such.
5 If not already, manually change your heading 1 style to be:
6 - sans serif (e.g. Arial, Verdana, Trebuchet or Calibri),
7 - 16 pt, and
8 - Bold
9 Then set this formatting as your default for this style.
10
11 Sub Headings:
12 Use Heading 2 style for sub headings.
13 If not already, manually change your heading 2 style to be:
14 - sans serif (e.g. Arial, Verdana, Trebuchet or Calibri),
15 - 14 pt, and
16 - Bold
17 Then set this formatting as your default for this style.
18
19 Sub Sub Headings:
20 Use Heading 3 for sub sub-headings.
21 If not already, manually change your heading 2 style to be:
22 - sans serif (e.g. Arial, Verdana, Trebuchet or Calibri),
23 - 12 pt, and
24 - Bold
25 Then set this formatting as your default for this style.
26
27 Paragraph
28 Paragraphs should not be styled as headings. Paragraphs should be ‘normal’ style.
29 They should be:
30 Sans serif font, 12 point
31 1.5 spacing (except for lists of bullet points)
32 Left aligned instead of justified
33 Then set this formatting as your default for this style.
34
35 Your document should also:
36 Leave sufficient white space at either side of the page
37 Avoid using block capitals or italics. Use bold to make text stand out instead.
38
39 To amend default styles:
40
41 Amend the style in line with the above guidelines then right click the style in question under the home tab. Choose ‘modify’ from the drop down list. This will open a box.
42
43 Within the box, ensure that the style is formatted to your preferences. For example, if ‘italics’ is checked, uncheck it.
44
45 Choose the radio button that states: ‘New documents based on this template’, and click ‘okay’.
46
47
48
49 To amend paragraph defaulting:
50
51 Left click ‘paragraph’ under the home tab.
52
53 Ensure your alignment is set to ‘left’ and line spacing is set to ‘1.5 lines’.
54 Once your settings are correct click ‘default’.
55 Click ‘yes’ on the resulting ‘Are your sure’ message.
56
57 To test your new settings
58 Open a new document and test each heading and paragraph style to ensure all settings have been saved.
59
60 Table Usage
61 Construct tables to read logically from left to right, top to bottom order.
62 Tables with column headings in the top row must have the top row formatted as a header row. To set a table header row:
63 Highlight the top row of the table
64 Right click to display editing options
65 Select “Table Properties” from the list.
66
67 The Table Properties window will be displayed; click on the “Row” tab
68 Check the option “Repeat as header at the top of each page”
1.1
1.6 Row 1
2.2 Col 1
2.7
3.3 Col 2
3.8
4.4 Col 3
4.9
5.5 Col 4
5.10
110
111 Images
112 Alternative or Alt text is required for all images in a document (excluding purely decorative images without meaningful content).
113 Right-click on the image;
114 Select Format Picture.
115 The Format Picture dialog box will appear. Select the Web tab.
116 In the Alternative text box, type in the description of the image.
117 Click “OK”.
118
119
120 Rationale for following Clear Print Guidelines
121 Example A:
122 Example A is Times New Roman, size ten, with single spacing. Example B is Arial, size twelve with 1.5 spacing. As you can see, smaller font sizes, single spacing and serif fonts are harder to read. Additionally, it is easier to keep one’s place on a page with left aligned text, as in example B, as left alignment gives the body of the text a specific shape and gives uniformity between words. Example A, which is justified, has no natural shape. Furthermore, bold print stands out, and does not distort the shape of text as italics and underlining do. FINALLY, BLOCK CAPITALS CAN BE DIFFICULT TO FOLLOW AS BLOCK CAPITALS REMOVE THE NATURAL SHAPE OF WORDS, TURNING THEM INTO BLOCKS. Clear layout allows one to focus on the content of visual materials rather than the format.
123
124 Example B:
125 Example A is Times New Roman, size ten, with single spacing. Example B is Arial, size twelve with 1.5 spacing. As you can see, smaller font sizes, single spacing and serif fonts are harder to read. Additionally, it is easier to keep one’s place on a page with left aligned text, as in example B, as left alignment gives the body of the text a specific shape and gives uniformity between words. Example A, which is justified, has no natural shape. Furthermore, bold print stands out, and does not distort the shape of text as italics and underlining do. Finally, block capitals can be difficult to follow as block capitals remove the natural shape of words, turning them into blocks. Clear layout allows one to focus on the content of visual materials rather than the format.
126
127 Furthermore
128 If you use headings it makes the creation and upkeep of tables of contents easier (For automatic creation and updating go to: Insert – Reference – Index and Tables – Table of contents).
129
level num_id row_id is_header cell_id col_span row_span
1 NA NA NA NA NA NA NA
2 NA NA NA NA NA NA NA
3 NA NA NA NA NA NA NA
4 NA NA NA NA NA NA NA
5 NA NA NA NA NA NA NA
6 NA NA NA NA NA NA NA
7 NA NA NA NA NA NA NA
8 NA NA NA NA NA NA NA
9 NA NA NA NA NA NA NA
10 NA NA NA NA NA NA NA
11 NA NA NA NA NA NA NA
12 NA NA NA NA NA NA NA
13 NA NA NA NA NA NA NA
14 NA NA NA NA NA NA NA
15 NA NA NA NA NA NA NA
16 NA NA NA NA NA NA NA
17 NA NA NA NA NA NA NA
18 NA NA NA NA NA NA NA
19 NA NA NA NA NA NA NA
20 NA NA NA NA NA NA NA
21 NA NA NA NA NA NA NA
22 NA NA NA NA NA NA NA
23 NA NA NA NA NA NA NA
24 NA NA NA NA NA NA NA
25 NA NA NA NA NA NA NA
26 NA NA NA NA NA NA NA
27 NA NA NA NA NA NA NA
28 NA NA NA NA NA NA NA
29 NA NA NA NA NA NA NA
30 1 1 NA NA NA NA NA
31 1 1 NA NA NA NA NA
32 1 1 NA NA NA NA NA
33 NA NA NA NA NA NA NA
34 NA NA NA NA NA NA NA
35 NA NA NA NA NA NA NA
36 1 1 NA NA NA NA NA
37 1 1 NA NA NA NA NA
38 NA NA NA NA NA NA NA
39 NA NA NA NA NA NA NA
40 NA NA NA NA NA NA NA
41 NA NA NA NA NA NA NA
42 NA NA NA NA NA NA NA
43 NA NA NA NA NA NA NA
44 NA NA NA NA NA NA NA
45 NA NA NA NA NA NA NA
46 NA NA NA NA NA NA NA
47 NA NA NA NA NA NA NA
48 NA NA NA NA NA NA NA
49 NA NA NA NA NA NA NA
50 NA NA NA NA NA NA NA
51 NA NA NA NA NA NA NA
52 NA NA NA NA NA NA NA
53 NA NA NA NA NA NA NA
54 NA NA NA NA NA NA NA
55 NA NA NA NA NA NA NA
56 NA NA NA NA NA NA NA
57 NA NA NA NA NA NA NA
58 NA NA NA NA NA NA NA
59 NA NA NA NA NA NA NA
60 NA NA NA NA NA NA NA
61 NA NA NA NA NA NA NA
62 NA NA NA NA NA NA NA
63 1 2 NA NA NA NA NA
64 1 2 NA NA NA NA NA
65 1 2 NA NA NA NA NA
66 NA NA NA NA NA NA NA
67 NA NA NA NA NA NA NA
68 NA NA NA NA NA NA NA
1.1 NA NA 1 TRUE 1 1 1
1.6 NA NA 2 FALSE 1 1 1
2.2 NA NA 1 TRUE 2 1 1
2.7 NA NA 2 FALSE 2 1 1
3.3 NA NA 1 TRUE 3 1 1
3.8 NA NA 2 FALSE 3 1 1
4.4 NA NA 1 TRUE 4 1 1
4.9 NA NA 2 FALSE 4 1 1
5.5 NA NA 1 TRUE 5 1 1
5.10 NA NA 2 FALSE 5 1 1
110 NA NA NA NA NA NA NA
111 NA NA NA NA NA NA NA
112 NA NA NA NA NA NA NA
113 1 3 NA NA NA NA NA
114 1 3 NA NA NA NA NA
115 1 3 NA NA NA NA NA
116 1 3 NA NA NA NA NA
117 1 3 NA NA NA NA NA
118 NA NA NA NA NA NA NA
119 NA NA NA NA NA NA NA
120 NA NA NA NA NA NA NA
121 NA NA NA NA NA NA NA
122 NA NA NA NA NA NA NA
123 NA NA NA NA NA NA NA
124 NA NA NA NA NA NA NA
125 NA NA NA NA NA NA NA
126 NA NA NA NA NA NA NA
127 NA NA NA NA NA NA NA
128 NA NA NA NA NA NA NA
129 NA NA NA NA NA NA NA
filter table section
row_id cell_id text
1.6 2 1 Row 1
2.7 2 2
3.8 2 3
4.9 2 4
5.10 2 5
transform the table data into a data frame
# The summary holds the table in long form (one row per cell); rebuild the
# rectangular table from it.
# Header cells supply the column names, the remaining cells the body.
table_names <- table_cells$text[table_cells$is_header]
table_content <- table_cells$text[!table_cells$is_header]
# Cells per column minus the header row gives the number of body rows.
row_count <- nrow(table_cells) / length(table_names) - 1
table_mat <- setNames(
  as.data.frame(matrix(table_content, nrow = row_count)),
  table_names
)
table_mat
Col 1 Col 2 Col 3 Col 4
1 Row 1
Write a gzip-compressed parquet file
---
title: "input & output in R"
author: "Tony Duan"
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: right
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
Data input and output in R
{width="500"}
```{r}
library(tidyverse)
library(openxlsx)
library(readxl)
```
# Input
## read CSV
```{r}
# Load the local CSV file into a tibble and preview its first rows.
data001 <- read_csv("data/Book3.csv")
head(data001)
```
read CSV online
```{r}
# Same reader as for a local file, this time given a URL instead of a path.
url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv"
data001 <- read_csv(url)
head(data001)
```
## read excel
```{r}
library(openxlsx)
library(readxl)

# Read the workbook with readxl and preview the first rows.
data001 <- read_excel("data/Book1.xlsx")
head(data001)
```
## read parquet
The arrow package must be installed this way; otherwise it will report an error.
```{r}
#| eval: false
# Install arrow from the Apache R-universe repository.
install.packages("arrow", repos = "https://apache.r-universe.dev")
```
```{r}
library(arrow)
# Print the installed arrow version together with its capabilities, memory,
# runtime and build details.
arrow_info()
```
```{r}
# Load the parquet file into a tibble and preview its first rows.
data001 <- read_parquet("data/df.parquet")
head(data001)
```
Read a gzip-compressed parquet file
```{r}
# The gzip-compressed file is read with the same call as the plain one.
data001 <- read_parquet("data/df.parquet.gzip")
head(data001)
```
## read feather
```{r}
library(feather)

# Load the feather file and preview its first rows.
data001 <- read_feather("data/feather_file.feather")
head(data001)
```
## read JSON file and convert into data frame
```{r}
library(jsonlite)

# Parse the JSON file; the result is converted to a data frame in the next chunk.
data <- read_json("./data/dataj.json")
```
```{r}
# Convert the parsed JSON into a data frame.
data002 <- as.data.frame(data)
```
```{r}
# Show the converted JSON column by column: name, type and leading values.
glimpse(data002)
```
## read Data from Images
```{r}
library(tesseract)
```
```{r}
# Create the OCR engine for the "eng" language (presumably the English
# trained data; confirm it is installed).
eng <- tesseract("eng")
# Run OCR on the image with that engine and keep the recognised text.
text001 <- tesseract::ocr("640.png", engine = eng)
```
```{r}
# Print the recognised text as-is so its line breaks are rendered.
cat(text001)
```
```{r}
# Turn the raw OCR string into one row per recognised line, then clean it up.
raw_tibble001 <- tibble(data = unlist(str_split(text001, pattern = "\n"))) %>%
  # Skip the first five lines and drop lines shorter than two characters.
  slice(6:n()) %>%
  filter(str_length(data) >= 2) %>%
  mutate(
    # Keep the untouched OCR line next to the cleaned one.
    old = data,
    # Strip underscores, em dashes and both kinds of quote characters.
    data = data %>%
      str_replace_all("_", "") %>%
      str_replace_all("—", "") %>%
      str_replace_all("'", "") %>%
      str_replace_all('"', "")
  ) %>%
  # Text before the yen sign goes to `one`, text after it to `price`.
  separate(data, into = c("one", "price"), sep = "¥") %>%
  mutate(
    # `id` is the leading run of non-space characters plus the space after it;
    # `name` is `one` with that leading token removed.
    id = str_extract(one, "^\\S+ "),
    name = gsub("^\\S+ ", "", one)
  )
glimpse(raw_tibble001)
```
## read data from RData/Rds
Rda is just a short name for RData. Rds stores a single R object. RData can store multiple R objects in a single file.
### read one data
```{r}
#| eval: false
# Restore the single object stored in the .rds file under a name chosen here.
my_data <- readRDS("mtcars.rds")
```
### read multiple data
```{r}
#| eval: false
# Restore the objects stored in the .RData file into the current workspace.
load("my_work_space.RData")
```
## read docx
```{r}
# Install officer only when it is missing, so that rendering the document does
# not reinstall the package (and require network access) on every run.
if (!requireNamespace("officer", quietly = TRUE)) {
  pak::pkg_install("officer")
}
library(officer)

# Open the Word document; `docx` is inspected by the chunks that follow.
docx <- read_docx("data/example03.docx")
```
```{r}
# Report the page size, orientation and margins of the document.
docx_dim(docx)
```
```{r}
# List the document content as a data frame: one row per paragraph or table
# cell, with its style name, text and table position.
docx_summary(docx)
```
filter table section
```{r}
# Keep only the rows of the document summary that describe table cells.
table_cells <- docx_summary(docx) %>%
  filter(content_type == "table cell")

# Body cells only (header cells dropped), reduced to their position and text.
table_data <- table_cells %>%
  filter(!is_header) %>%
  select(row_id, cell_id, text)
table_data
```
transform the table data into a data frame
```{r}
# The summary holds the table in long form (one row per cell); rebuild the
# rectangular table from it.
# Header cells supply the column names, the remaining cells the body.
table_names <- table_cells$text[table_cells$is_header]
table_content <- table_cells$text[!table_cells$is_header]
# Cells per column minus the header row gives the number of body rows.
row_count <- nrow(table_cells) / length(table_names) - 1
table_mat <- setNames(
  as.data.frame(matrix(table_content, nrow = row_count)),
  table_names
)
table_mat
```
## read txt
```{r}
# Read the delimited text file into a data frame and print it.
my_data <- read.delim("text.txt")
my_data
```
# Output
## write csv
```{r}
# Export data001 as CSV. row.names = FALSE stops write.csv() from prepending an
# unnamed column of row numbers, so the file reads back with the same columns
# it was written with.
write.csv(data001, "data001 csv output data.csv", row.names = FALSE)
```
## write excel
```{r}
library(openxlsx)
library(readxl)

# Export data001 as an Excel workbook.
write.xlsx(data001, "data001 excel output data.xlsx")
```
## write parquet
```{r}
library(arrow)

# Export data001 as a parquet file.
write_parquet(data001, "data/df.parquet")
```
Write a gzip-compressed parquet file
```{r}
# Same export, with gzip selected as the compression codec.
write_parquet(data001, "data/df.parquet.gzip", compression = "gzip")
```
## write feather
```{r}
library(feather)

# Export data001 as a feather file.
write_feather(data001, "data/feather_file.feather")
```
## write txt
```{r}
# Build a one-column tibble holding the text to write. The column is named
# explicitly: an unnamed argument makes tibble() use the deparsed string
# literal itself as the column name, which then becomes the file's header.
text <- tibble(text = 'hello world
its time!')
write_delim(text, "text.txt")
```
## write to RData/Rds
### write one data
```{r}
# Serialise one object (here the built-in mtcars data) to an .rds file.
saveRDS(mtcars, file = "mtcars.rds")
```
### write multiple data
```{r}
# Save several named objects (mtcars and iris) together in one .RData file
save(mtcars,iris, file="mtcars_and_iris.RData")
```
### write all working space data
```{r}
# Save every object in the current workspace to one .RData file
save.image(file = "my_work_space.RData")
```